Created January 24, 2025 13:39
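IR dumps from IREE's LLVMGPUVectorDistribute codegen pipeline (workgroup_size = [64, 1, 1], subgroup_size = 64) for the matvec_fp16 dispatch: a 1x4096 f16 vector times a 32000x4096 f16 matrix producing a 1x32000 f16 result. The first dump shows the N = 32000 dimension already distributed to workgroups in tiles of 4 (workgroup = [0, 4, 0] in the lowering_config); each subsequent dump shows the IR after one more pass.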
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
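// Note: the next six dumps (config-tracking canonicalize, CSE, attention-to-online-attention, canonicalize, CSE, and matmul-operand promotion) are identical to the dump above; none of those passes change this dispatch.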
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After ConvertAttentionToOnlineAttentionPass (iree-linalg-ext-convert-attention-to-online-attention) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After GPUPromoteMatmulOperandsPass (iree-codegen-gpu-promote-matmul-operands) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %4[%arg0, 0] [4, 4096] [1, 1] : tensor<32000x4096xf16> to tensor<4x4096xf16> | |
%extracted_slice_0 = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice_0 : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %extracted_slice : tensor<1x4096xf16>, tensor<4x4096xf16>) outs(%7 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_1: f16, %out: f16): | |
%9 = arith.mulf %in, %in_1 : f16 | |
%10 = arith.addf %out, %9 : f16 | |
linalg.yield %10 : f16 | |
} -> tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After GPUApplyTilingLevelPass (iree-codegen-gpu-apply-tiling-level) //----- // | |
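// Note: GPUApplyTilingLevelPass is the first pass after workgroup distribution that changes the IR: the reduction dimension (4096) is tiled by 128 per reduction = [0, 0, 128] in the lowering_config, introducing the scf.for loop over %arg2 and the 1x128 / 4x128 operand slices seen below.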
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
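// Note: the next six dumps (loop coalescing, canonicalize, CSE, attention decomposition, canonicalize, CSE) are again identical to the dump above.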
// -----// IR Dump After LoopCoalescing (affine-loop-coalescing) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeAttentionPass (iree-linalg-ext-decompose-attention) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%arg3 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%10 = arith.mulf %in, %in_2 : f16 | |
%11 = arith.addf %out, %10 : f16 | |
linalg.yield %11 : f16 | |
} -> tensor<1x4xf16> | |
scf.yield %9 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After LLVMGPUConfigureTensorLayoutsPass (iree-llvmgpu-configure-tensor-layouts) //----- // | |
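// Note: LLVMGPUConfigureTensorLayoutsPass anchors iree_vector_ext.to_layout ops on the matvec operands and accumulator. The 4x128 matrix tile is distributed over the 64-lane subgroup as thread_tile = [4, 16] with element_tile = [1, 8] (8 contiguous f16 per lane); the 1x128 vector tile uses thread_tile = [1, 16] with element_tile = [1, 8]; and the 1x4 accumulator uses thread_tile = [1, 4] with thread_strides = [0, 16], i.e. one output element on every 16th lane.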
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [0, 1]>) : tensor<1x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%11 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%11 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%14 = arith.mulf %in, %in_2 : f16 | |
%15 = arith.addf %out, %14 : f16 | |
linalg.yield %15 : f16 | |
} -> tensor<1x4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
scf.yield %13 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
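// Note: loop-invariant code motion makes no changes; the dump below is identical to the dump above.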
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.fill ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [0, 1]>) : tensor<1x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%11 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%11 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%14 = arith.mulf %in, %in_2 : f16 | |
%15 = arith.addf %out, %14 : f16 | |
linalg.yield %15 : f16 | |
} -> tensor<1x4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
scf.yield %13 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After LinalgGeneralizeNamedOpsPass (linalg-generalize-named-ops) //----- // | |
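// Note: the only change here is that linalg.fill is generalized into an equivalent linalg.generic that broadcasts the f16 zero constant into the 1x4 accumulator slice; the matvec linalg.generic itself is unchanged.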
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_0 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [0, 1]>) : tensor<1x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%11 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%11 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_2: f16, %out: f16): | |
%14 = arith.mulf %in, %in_2 : f16 | |
%15 = arith.addf %out, %14 : f16 | |
linalg.yield %15 : f16 | |
} -> tensor<1x4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [1, 4], element_tile = [1, 1], subgroup_strides = [0, 0], thread_strides = [0, 16]>) : tensor<1x4xf16> | |
scf.yield %13 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
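// Note: LinalgGeneralizeNamedOpsPass rewrote the named linalg.fill on the tensor<1x4xf16>
// accumulator slice into the equivalent linalg.generic %7 above, which broadcasts %cst = 0.0 into
// the output; the loop structure and layout anchors are otherwise unchanged.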
// -----// IR Dump After VectorExtFoldUnitExtentDimsPass (iree-vector-ext-fold-unit-extent-dims) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f16) outs(%extracted_slice : tensor<1x4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<1x4xf16>) { | |
%extracted_slice_0 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_1 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice_0[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%10 = tensor.empty() : tensor<1x128xf16> | |
%inserted_slice = tensor.insert_slice %9 into %10[0, 0] [1, 128] [1, 1] : tensor<128xf16> into tensor<1x128xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_1 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %arg3[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%12 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%13 = tensor.empty() : tensor<1x4xf16> | |
%inserted_slice_4 = tensor.insert_slice %12 into %13[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%inserted_slice, %11 : tensor<1x128xf16>, tensor<4x128xf16>) outs(%inserted_slice_4 : tensor<1x4xf16>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 128], subgroup_basis = [[1, 1, 1], [0, 1, 2]], thread = [0, 0, 8], thread_basis = [[1, 4, 16], [0, 1, 2]], workgroup = [0, 4, 0]}>} { | |
^bb0(%in: f16, %in_7: f16, %out: f16): | |
%17 = arith.mulf %in, %in_7 : f16 | |
%18 = arith.addf %out, %17 : f16 | |
linalg.yield %18 : f16 | |
} -> tensor<1x4xf16> | |
%extracted_slice_5 = tensor.extract_slice %14[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%15 = iree_vector_ext.to_layout %extracted_slice_5 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%16 = tensor.empty() : tensor<1x4xf16> | |
%inserted_slice_6 = tensor.insert_slice %15 into %16[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.yield %inserted_slice_6 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
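// Note: VectorExtFoldUnitExtentDimsPass rank-reduced the operands of each
// iree_vector_ext.to_layout: the 1x128 and 1x4 tensors are sliced down to 128 and 4 elements,
// given rank-1 layouts, and re-expanded with tensor.insert_slice so that the still rank-2
// linalg.generic contraction keeps consuming tensor<1x128xf16> / tensor<1x4xf16>.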
// -----// IR Dump After LinalgFoldUnitExtentDimsPass (linalg-fold-unit-extent-dims) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%inserted_slice = tensor.insert_slice %7 into %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %inserted_slice) -> (tensor<1x4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%extracted_slice_4 = tensor.extract_slice %arg3[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128xf16>, tensor<4x128xf16>) outs(%11 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_6: f16, %out: f16): | |
%15 = arith.mulf %in, %in_6 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%14 = tensor.empty() : tensor<1x4xf16> | |
%inserted_slice_5 = tensor.insert_slice %13 into %14[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.yield %inserted_slice_5 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
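// Note: LinalgFoldUnitExtentDimsPass then dropped the unit dimension from the contraction itself:
// the generic now uses indexing maps (d1), (d0, d1), (d0) with ["parallel", "reduction"] over
// tensor<128xf16> x tensor<4x128xf16> -> tensor<4xf16>, a plain matvec of the 4x128 tile, and most
// of the rank-expanding insert_slice ops introduced by the previous pass fold away.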
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%inserted_slice = tensor.insert_slice %7 into %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %inserted_slice) -> (tensor<1x4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%extracted_slice_4 = tensor.extract_slice %arg3[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128xf16>, tensor<4x128xf16>) outs(%11 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_6: f16, %out: f16): | |
%15 = arith.mulf %in, %in_6 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%14 = tensor.empty() : tensor<1x4xf16> | |
%inserted_slice_5 = tensor.insert_slice %13 into %14[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.yield %inserted_slice_5 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%inserted_slice = tensor.insert_slice %7 into %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %inserted_slice) -> (tensor<1x4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%9 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%extracted_slice_4 = tensor.extract_slice %arg3[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128xf16>, tensor<4x128xf16>) outs(%11 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_6: f16, %out: f16): | |
%15 = arith.mulf %in, %in_6 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%14 = tensor.empty() : tensor<1x4xf16> | |
%inserted_slice_5 = tensor.insert_slice %13 into %14[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.yield %inserted_slice_5 : tensor<1x4xf16> | |
} | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
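// Note: the Canonicalizer and CSE dumps above are byte-for-byte identical to the
// LinalgFoldUnitExtentDimsPass output; neither pass found anything further to simplify here.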
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10, %11 : tensor<128xf16>, tensor<4x128xf16>) outs(%12 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_4: f16, %out: f16): | |
%15 = arith.mulf %in, %in_4 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
scf.yield %14 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
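// Note: OptimizeTensorInsertExtractSlicesPass hoisted the tensor<4xf16> -> tensor<1x4xf16>
// re-insertion out of the reduction loop: the scf.for now carries the rank-reduced tensor<4xf16>
// accumulator directly, and the 1x4 wrapper is rebuilt once after the loop before the
// tensor.parallel_insert_slice into the workgroup result.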
// -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10, %11 : tensor<128xf16>, tensor<4x128xf16>) outs(%12 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_4: f16, %out: f16): | |
%15 = arith.mulf %in, %in_4 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
scf.yield %14 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = iree_vector_ext.to_layout %extracted_slice_2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : tensor<128xf16> | |
%11 = iree_vector_ext.to_layout %extracted_slice_3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : tensor<4x128xf16> | |
%12 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10, %11 : tensor<128xf16>, tensor<4x128xf16>) outs(%12 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_4: f16, %out: f16): | |
%15 = arith.mulf %in, %in_4 : f16 | |
%16 = arith.addf %out, %15 : f16 | |
linalg.yield %16 : f16 | |
} -> tensor<4xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : tensor<4xf16> | |
scf.yield %14 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
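// Note: DecomposeConvolutionToLowerDimOpsPass and DecomposeIm2colPass left the IR untouched, as
// expected: this matvec dispatch contains no convolution or im2col ops.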
// -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_0 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = linalg.generic {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%cst : f16) outs(%extracted_slice_0 : tensor<4xf16>) { | |
^bb0(%in: f16, %out: f16): | |
linalg.yield %in : f16 | |
} -> tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_1 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_2 = tensor.extract_slice %extracted_slice_1[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_3 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = vector.transfer_read %extracted_slice_2[%c0], %cst {in_bounds = [true]} : tensor<128xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = tensor.empty() : tensor<128xf16> | |
%13 = vector.transfer_write %11, %12[%c0] {in_bounds = [true]} : vector<128xf16>, tensor<128xf16> | |
%14 = vector.transfer_read %extracted_slice_3[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<4x128xf16>, vector<4x128xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%16 = tensor.empty() : tensor<4x128xf16> | |
%17 = vector.transfer_write %15, %16[%c0, %c0] {in_bounds = [true, true]} : vector<4x128xf16>, tensor<4x128xf16> | |
%18 = vector.transfer_read %arg3[%c0], %cst {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> | |
%19 = iree_vector_ext.to_layout %18 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%20 = tensor.empty() : tensor<4xf16> | |
%21 = vector.transfer_write %19, %20[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%13, %17 : tensor<128xf16>, tensor<4x128xf16>) outs(%21 : tensor<4xf16>) { | |
^bb0(%in: f16, %in_4: f16, %out: f16): | |
%27 = arith.mulf %in, %in_4 : f16 | |
%28 = arith.addf %out, %27 : f16 | |
linalg.yield %28 : f16 | |
} -> tensor<4xf16> | |
%23 = vector.transfer_read %22[%c0], %cst {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> | |
%24 = iree_vector_ext.to_layout %23 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%25 = tensor.empty() : tensor<4xf16> | |
%26 = vector.transfer_write %24, %25[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
scf.yield %26 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
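// Note: VectorizeIREEVectorExtOpsPass moved the iree_vector_ext.to_layout ops onto vector values;
// each tensor operand is now bridged through a vector.transfer_read / to_layout /
// vector.transfer_write round trip (%10 through %13 above) while the reduction itself is still a
// linalg.generic on tensors. The redundant tensor round trips disappear once the generic is
// vectorized by the next pass.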
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = vector.transfer_write %cst, %extracted_slice_1[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_2 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_2[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_4 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = vector.transfer_read %extracted_slice_3[%c0], %cst_0 {in_bounds = [true]} : tensor<128xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %extracted_slice_4[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x128xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = vector.transfer_read %arg3[%c0], %cst_0 {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %15 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%17 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%18 = tensor.empty() : tensor<4xf16> | |
%19 = vector.transfer_write %17, %18[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
scf.yield %19 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
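// Note: GenericVectorizationPass vectorized the remaining linalg ops: the zero-fill became a
// vector.transfer_write of the dense<0.0> splat %cst, and the reduction generic became the
// vector.contract computing acc[d0] += sum over d1 of x[d1] * A[d0, d1].
// The following is a minimal standalone sketch (not part of the dump) of that same contraction,
// useful as a parse-level reference for the indexing maps; the function name and bare arguments
// are illustrative only.
func.func @matvec_contract_sketch(%x: vector<128xf16>, %A: vector<4x128xf16>, %acc: vector<4xf16>) -> vector<4xf16> {
  // y[d0] = acc[d0] + sum over d1 of x[d1] * A[d0, d1], matching the maps used in the dump above.
  %y = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %x, %A, %acc : vector<128xf16>, vector<4x128xf16> into vector<4xf16>
  return %y : vector<4xf16>
}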
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = vector.transfer_write %cst, %extracted_slice_1[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_2 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_2[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_4 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = vector.transfer_read %extracted_slice_3[%c0], %cst_0 {in_bounds = [true]} : tensor<128xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %extracted_slice_4[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x128xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = vector.transfer_read %arg3[%c0], %cst_0 {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %15 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%17 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%18 = tensor.empty() : tensor<4xf16> | |
%19 = vector.transfer_write %17, %18[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
scf.yield %19 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%extracted_slice_1 = tensor.extract_slice %extracted_slice[0, 0] [1, 4] [1, 1] : tensor<1x4xf16> to tensor<4xf16> | |
%7 = vector.transfer_write %cst, %extracted_slice_1[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %7) -> (tensor<4xf16>) { | |
%extracted_slice_2 = tensor.extract_slice %3[0, %arg2] [1, 128] [1, 1] : tensor<1x4096xf16> to tensor<1x128xf16> | |
%extracted_slice_3 = tensor.extract_slice %extracted_slice_2[0, 0] [1, 128] [1, 1] : tensor<1x128xf16> to tensor<128xf16> | |
%extracted_slice_4 = tensor.extract_slice %4[%arg0, %arg2] [4, 128] [1, 1] : tensor<32000x4096xf16> to tensor<4x128xf16> | |
%10 = vector.transfer_read %extracted_slice_3[%c0], %cst_0 {in_bounds = [true]} : tensor<128xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %extracted_slice_4[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x128xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = vector.transfer_read %arg3[%c0], %cst_0 {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %15 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%17 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%18 = tensor.empty() : tensor<4xf16> | |
%19 = vector.transfer_write %17, %18[%c0] {in_bounds = [true]} : vector<4xf16>, tensor<4xf16> | |
scf.yield %19 : tensor<4xf16> | |
} | |
%inserted_slice = tensor.insert_slice %9 into %8[0, 0] [1, 4] [1, 1] : tensor<4xf16> into tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %inserted_slice into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- // | |
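// Note: compared to the previous dump, the accumulator no longer lives in a tensor. The reduction loop now carries a vector<4xf16> iter_arg seeded from the zero constant, the transfer_read ops index the original 1x4096 and 32000x4096 tensors directly, and a single transfer_write after the loop materializes the 1x4 result.
// As implied by the layout attributes and workgroup_size = [64, 1, 1]: thread_tile = [4, 16] uses all 64 lanes and element_tile = [1, 8] gives each lane 8 f16 along the reduction, so 16 lanes x 8 elements cover the 128-wide K slice while the four groups of 16 lanes cover the 4 output rows.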
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
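// Note: no changes relative to the previous dump.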
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
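// Note: no changes relative to the previous dump.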
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After GPUVectorAllocPass (iree-codegen-gpu-vector-alloc) //----- // | |
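// Note: no operands of this matvec are promoted to shared memory, so the IR is unchanged from the previous dump.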
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
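// Note: no changes relative to the previous dump.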
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
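// Note: no changes relative to the previous dump.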
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After GPUCombineValueBarriersPass (iree-codegen-gpu-combine-value-barriers) //----- // | |
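// Note: there are no value barriers in this kernel to combine, so the IR is unchanged from the previous dump.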
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = tensor.empty() : tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%7 = tensor.empty() : tensor<1x4xf16> | |
%8 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%10 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%11 = iree_vector_ext.to_layout %10 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%12 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %12 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%15 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %11, %13, %14 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%16 = iree_vector_ext.to_layout %15 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %16 : vector<4xf16> | |
} | |
%9 = vector.transfer_write %8, %7[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %9 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- // | |
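// Note: the output is no longer seeded from tensor.empty; %5 now loads the writeonly 1x32000 binding, and the transfer_write targets the 1x4 slice extracted from the shared_outs argument, which lets bufferization later write the result in place. The remaining tensor.empty ops are now dead.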
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> -> tensor<1x32000xf16> | |
%6 = tensor.empty() : tensor<1x32000xf16> | |
%7 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%8 = tensor.empty() : tensor<1x4xf16> | |
%9 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%11 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%13 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%14 = iree_vector_ext.to_layout %13 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%15 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%16 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %12, %14, %15 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%17 = iree_vector_ext.to_layout %16 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %17 : vector<4xf16> | |
} | |
%10 = vector.transfer_write %9, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %10 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- // | |
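// Note: the dead tensor.empty ops from the previous dump are gone, and no bufferization.alloc_tensor is introduced for this kernel.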
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16> | |
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16> | |
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> -> tensor<1x32000xf16> | |
%6 = scf.forall (%arg0) = (0) to (32000) step (4) shared_outs(%arg1 = %5) -> (tensor<1x32000xf16>) { | |
%extracted_slice = tensor.extract_slice %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x32000xf16> to tensor<1x4xf16> | |
%7 = scf.for %arg2 = %c0 to %c4096 step %c128 iter_args(%arg3 = %cst) -> (vector<4xf16>) { | |
%9 = vector.transfer_read %3[%c0, %arg2], %cst_0 {in_bounds = [true]} : tensor<1x4096xf16>, vector<128xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%11 = vector.transfer_read %4[%arg0, %arg2], %cst_0 {in_bounds = [true, true]} : tensor<32000x4096xf16>, vector<4x128xf16> | |
%12 = iree_vector_ext.to_layout %11 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%13 = iree_vector_ext.to_layout %arg3 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %10, %12, %13 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%15 = iree_vector_ext.to_layout %14 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %15 : vector<4xf16> | |
} | |
%8 = vector.transfer_write %7, %extracted_slice[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, tensor<1x4xf16> | |
scf.forall.in_parallel { | |
tensor.parallel_insert_slice %8 into %arg1[0, %arg0] [1, 4] [1, 1] : tensor<1x4xf16> into tensor<1x32000xf16> | |
} | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>> | |
return | |
} | |
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- // | |
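// Note: bufferization replaces the dispatch tensors with memref bindings carrying 64-byte alignment assumptions, and each workgroup writes its vector<4xf16> result through a 1x4 subview of the output buffer. The subview-to-subview copy and the whole-buffer memref.copy %2, %2 copy the destination onto itself and are cleaned up by the canonicalize/CSE passes that follow.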
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
memref.copy %2, %2 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- // | |
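// Note: no changes relative to the previous dump.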
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
memref.copy %2, %2 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
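// Note: the redundant whole-buffer memref.copy %2, %2 has been removed.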
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%subview_1 = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview_1 : memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
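// Note: %subview_1 has been folded into %subview, leaving a self-copy (memref.copy %subview, %subview) that the next canonicalization removes.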
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
memref.copy %subview, %subview : memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
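// Note: the remaining self-copy is gone; each workgroup now reads its 4x128 slices, accumulates in registers across the K loop, and writes the 1x4 result directly to the output buffer.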
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After HoistStaticallyBoundAllocationsPass (iree-codegen-hoist-statically-bound-allocations) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After LLVMGPUCastTypeToFitMMAPass (iree-llvmgpu-cast-type-to-fit-mma) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<4xf16> | |
%c128 = arith.constant 128 : index | |
%c4096 = arith.constant 4096 : index | |
%c0 = arith.constant 0 : index | |
%cst_0 = arith.constant 0.000000e+00 : f16 | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst) -> (vector<4xf16>) { | |
%4 = vector.transfer_read %0[%c0, %arg1], %cst_0 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<128xf16> | |
%5 = iree_vector_ext.to_layout %4 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [16], element_tile = [8], subgroup_strides = [0], thread_strides = [1]>) : vector<128xf16> | |
%6 = vector.transfer_read %1[%arg0, %arg1], %cst_0 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x128xf16> | |
%7 = iree_vector_ext.to_layout %6 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [1, 1], outer_tile = [1, 1], thread_tile = [4, 16], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [16, 1]>) : vector<4x128xf16> | |
%8 = iree_vector_ext.to_layout %arg2 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
%9 = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %5, %7, %8 : vector<128xf16>, vector<4x128xf16> into vector<4xf16> | |
%10 = iree_vector_ext.to_layout %9 to layout(#iree_vector_ext.nested_layout<subgroup_tile = [1], batch_tile = [1], outer_tile = [1], thread_tile = [4], element_tile = [1], subgroup_strides = [0], thread_strides = [16]>) : vector<4xf16> | |
scf.yield %10 : vector<4xf16> | |
} | |
vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true]} : vector<4xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
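// NOTE: every dump from the canonicalizer above down to this point is
// identical: CleanupBufferAllocView, the repeated canonicalize/CSE rounds,
// HoistStaticallyBoundAllocations (no static allocations to hoist) and
// LLVMGPUCastTypeToFitMMA (nothing to retype for MMA in this matvec) all leave
// the kernel untouched. The next pass, LLVMGPUVectorDistribute, is where the
// work is actually split across the 64 threads of the workgroup.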
// -----// IR Dump After LLVMGPUVectorDistributePass (iree-llvmgpu-vector-distribute) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_z = gpu.thread_id z | |
%thread_id_y = gpu.thread_id y | |
%thread_id_x = gpu.thread_id x | |
%0 = affine.linearize_index disjoint [%thread_id_z, %thread_id_y, %thread_id_x] by (1, 1, 64) : index | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %3, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %3[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%4 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %0] | |
%8 = vector.transfer_read %1[%c0, %7], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%9 = vector.insert_strided_slice %8, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %0] | |
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %0] | |
%12 = vector.transfer_read %2[%10, %11], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%13 = vector.insert_strided_slice %12, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%14 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %9, %13, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%15 = vector.shape_cast %14 : vector<1x1x1xf16> to vector<1x1x1x1x1x1xf16> | |
%16 = vector.multi_reduction <add>, %15, %cst_3 [1, 3, 5] : vector<1x1x1x1x1x1xf16> to vector<1x1x1xf16> | |
%17 = vector.extract %16[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%18 = gpu.subgroup_reduce add %17 cluster(size = 16) : (f16) -> f16 | |
%19 = vector.insert %18, %cst [0] : f16 into vector<1xf16> | |
%20 = vector.shape_cast %19 : vector<1xf16> to vector<1x1x1xf16> | |
%21 = arith.addf %20, %arg2 : vector<1x1x1xf16> | |
scf.yield %21 : vector<1x1x1xf16> | |
} | |
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 4)>()[%0] | |
%6 = vector.extract %4[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
vector.transfer_write %6, %subview[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
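// NOTE on the distribution above: workgroup_size = [64, 1, 1] with
// subgroup_size = 64 gives one subgroup per workgroup. The nested layout's
// thread_tile = [4, 16] splits the 64 lanes into 4 clusters of 16 consecutive
// lanes. The column map `s0 + s1 * 8 - (s1 floordiv 16) * 128` simplifies to
// arg1 + 8 * (tid mod 16), so each lane loads 8 contiguous f16 along K
// (16 lanes x 8 elements = the 128-wide reduction tile), and the row map
// simplifies to arg0 + (tid floordiv 16) mod 4, so each cluster owns one of
// the workgroup's 4 output rows. Each lane contracts its 8 elements, the
// cluster(size = 16) subgroup_reduce sums the 16 partials, and the result is
// accumulated across the 32 iterations of the K loop.
//
// The Python sketch below is a standalone functional model of that schedule,
// not IREE code: names such as simulate_workgroup, LANES, K_TILE and row_base
// are invented for illustration, and the float32 accumulator is a modelling
// convenience (the real kernel stays in f16 registers throughout).

import numpy as np

LANES, K_TILE = 64, 128

def simulate_workgroup(x, w, out, row_base):
    """Model one workgroup: x is (4096,), w is (32000, 4096), out is (32000,),
    all float16. row_base is the scf.forall induction value (multiple of 4)."""
    K = x.shape[0]
    acc = np.zeros(LANES, dtype=np.float32)           # per-lane iter_args accumulator
    for k_base in range(0, K, K_TILE):                # scf.for %arg1 = 0 to 4096 step 128
        partial = np.zeros(LANES, dtype=np.float32)
        for tid in range(LANES):
            col = k_base + 8 * (tid % 16)             # s0 + s1*8 - (s1 floordiv 16)*128
            row = row_base + (tid // 16) % 4          # s0 + s1 floordiv 16 - ...
            # vector.contract over the lane's 8 contiguous f16 elements.
            partial[tid] = np.dot(x[col:col + 8].astype(np.float32),
                                  w[row, col:col + 8].astype(np.float32))
        # gpu.subgroup_reduce add, cluster(size = 16): lanes 0-15, 16-31, 32-47
        # and 48-63 each sum their 16 partials; every lane in a cluster gets
        # the same reduced value and adds it to its accumulator.
        for c in range(LANES // 16):
            s = partial[16 * c:16 * (c + 1)].sum()
            acc[16 * c:16 * (c + 1)] += s
    # vector.transfer_write: lane tid writes row row_base + (tid // 16) % 4;
    # all 16 lanes of a cluster write the same value to the same element.
    for tid in range(LANES):
        out[row_base + (tid // 16) % 4] = np.float16(acc[tid])

// Looping row_base over range(0, 32000, 4) with this model reproduces the
// whole matvec, up to f16 rounding differences against the real kernel.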
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%11 = vector.transfer_read %1[%9, %10], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%12 = vector.insert_strided_slice %11, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%13 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %12, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%14 = vector.extract %13[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%15 = gpu.subgroup_reduce add %14 cluster(size = 16) : (f16) -> f16 | |
%16 = vector.insert %15, %cst [0] : f16 into vector<1xf16> | |
%17 = vector.shape_cast %16 : vector<1xf16> to vector<1x1x1xf16> | |
%18 = arith.addf %17, %arg2 : vector<1x1x1xf16> | |
scf.yield %18 : vector<1x1x1xf16> | |
} | |
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 4)>()[%thread_id_x] | |
%5 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
vector.transfer_write %5, %subview[%c0, %4] {in_bounds = [true]} : vector<1xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 4)>()[%thread_id_x] | |
%5 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
vector.transfer_write %5, %subview[%c0, %4] {in_bounds = [true]} : vector<1xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
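// NOTE: the canonicalize/CSE rounds above tidy the distributed form: with a
// 64x1x1 workgroup the affine.linearize_index of (z, y, x) folds to
// thread_id_x and the unused y/z thread ids disappear, the unit-shaped
// shape_cast + multi_reduction pair after the contract folds away, and CSE
// reuses a single affine.apply for the K column index in both transfer_reads.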
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%subview = memref.subview %2[0, %arg0] [1, 4] [1, 1] : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> to memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 4)>()[%thread_id_x] | |
%5 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
vector.transfer_write %5, %subview[%c0, %4] {in_bounds = [true]} : vector<1xf16>, memref<1x4xf16, strided<[32000, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
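// NOTE: GPUReduceBankConflicts pads workgroup-memory allocations to avoid
// shared-memory bank conflicts; this kernel allocates no shared memory, so the
// IR is unchanged here.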
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
vector.transfer_write %4, %2[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
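// NOTE: FoldMemRefAliasOps composes the memref.subview into its user, so the
// vector.transfer_write now targets the full 1x32000 buffer directly at column
// arg0 + (thread_id_x floordiv 16) mod 4 instead of going through the 1x4
// subview. As in the sketch above, the 16 lanes of a cluster all write the
// same reduced value to that element, so the store is redundant across lanes
// but consistent.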
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
vector.transfer_write %4, %2[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After Canonicalizer (canonicalize) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
vector.transfer_write %4, %2[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
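// Note: CSE and LLVMGPULowerExecutableTargetPass make no further changes to this dispatch; the two dumps below are identical to the canonicalized IR above.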
// -----// IR Dump After CSE (cse) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
vector.transfer_write %4, %2[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} | |
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- // | |
func.func @matvec_fp16() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64, {}>} { | |
%cst = arith.constant dense<0.000000e+00> : vector<1xf16> | |
%cst_0 = arith.constant dense<0.000000e+00> : vector<1x1x1x1x1x8xf16> | |
%cst_1 = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> | |
%cst_2 = arith.constant 0.000000e+00 : f16 | |
%c0 = arith.constant 0 : index | |
%c4096 = arith.constant 4096 : index | |
%c128 = arith.constant 128 : index | |
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x1xf16> | |
%thread_id_x = gpu.thread_id x | |
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> | |
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
scf.forall (%arg0) = (0) to (32000) step (4) { | |
%3 = scf.for %arg1 = %c0 to %c4096 step %c128 iter_args(%arg2 = %cst_3) -> (vector<1x1x1xf16>) { | |
%6 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 8 - (s1 floordiv 16) * 128)>()[%arg1, %thread_id_x] | |
%7 = vector.transfer_read %0[%c0, %6], %cst_2 {in_bounds = [true]} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16> | |
%8 = vector.insert_strided_slice %7, %cst_1 {offsets = [0, 0, 0], strides = [1]} : vector<8xf16> into vector<1x1x8xf16> | |
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
%10 = vector.transfer_read %1[%9, %6], %cst_2 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<1x8xf16> | |
%11 = vector.insert_strided_slice %10, %cst_0 {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x8xf16> into vector<1x1x1x1x1x8xf16> | |
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4)>], iterator_types = ["parallel", "reduction", "parallel", "reduction", "parallel", "reduction"], kind = #vector.kind<add>} %8, %11, %cst_3 : vector<1x1x8xf16>, vector<1x1x1x1x1x8xf16> into vector<1x1x1xf16> | |
%13 = vector.extract %12[0, 0, 0] : f16 from vector<1x1x1xf16> | |
%14 = gpu.subgroup_reduce add %13 cluster(size = 16) : (f16) -> f16 | |
%15 = vector.insert %14, %cst [0] : f16 into vector<1xf16> | |
%16 = vector.shape_cast %15 : vector<1xf16> to vector<1x1x1xf16> | |
%17 = arith.addf %16, %arg2 : vector<1x1x1xf16> | |
scf.yield %17 : vector<1x1x1xf16> | |
} | |
%4 = vector.extract %3[0, 0] : vector<1xf16> from vector<1x1x1xf16> | |
%5 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 floordiv 16 - ((s1 floordiv 16) floordiv 4) * 4)>()[%arg0, %thread_id_x] | |
vector.transfer_write %4, %2[%c0, %5] {in_bounds = [true]} : vector<1xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> | |
} {mapping = [#iree_codegen.workgroup_mapping<x>]} | |
return | |
} |
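At this point each scf.forall iteration owns four consecutive rows and is mapped to one workgroup (workgroup_mapping<x>), so the 32000 output elements are covered by 8000 workgroups of 64 threads. Below is a minimal sketch of that outer distribution, reusing the illustrative emulate_row defined earlier; the function name and the use of NumPy/float32 are assumptions for the sketch, not IREE code.

def emulate_dispatch(x, A):
    """Emulate the scf.forall (%arg0) = (0) to (32000) step (4) distribution."""
    y = np.zeros((1, 32000), dtype=np.float32)    # stands in for the writeonly tensor<1x32000xf16>
    for wg_row in range(0, 32000, 4):             # one workgroup per forall iteration
        for r in range(4):                        # the four 16-lane clusters in the 64-thread workgroup
            y[0, wg_row + r] = emulate_row(x, A, wg_row + r)   # vector.transfer_write to %2[0, row]
    return y

The result should agree with x.astype(np.float32) @ A.T.astype(np.float32) up to rounding.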