Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created April 22, 2024 16:19
Show Gist options
  • Save pashu123/530bcfea69329faccaf6633931dab239 to your computer and use it in GitHub Desktop.
Save pashu123/530bcfea69329faccaf6633931dab239 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
// -----// IR Dump After TypePropagation (iree-codegen-type-propagation) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After BubbleUpOrdinalOps (iree-codegen-bubble-up-ordinal-ops) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatches (iree-codegen-bufferize-copy-only-dispatches) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After DecomposeSoftmax (iree-codegen-decompose-softmax) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After MaterializeUserConfigs (iree-codegen-materialize-user-configs) //----- //
// Anonymous top-level module containing the single dispatch function below.
module {
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
}
// -----// IR Dump After RematerializeParallelOps (iree-codegen-rematerialize-parallel-ops) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After ExpandF16OpToF32 (iree-llvmcpu-expand-f16-op-to-f32) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After CPUMaterializeEncoding (iree-codegen-cpu-materialize-encoding) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After EraseHALDescriptorTypeFromMemRef (iree-codegen-erase-hal-descriptor-type-from-memref) //----- //
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function takes no arguments and returns nothing; all inputs arrive
// through HAL interface constants and bindings.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
// -----// IR Dump After LLVMCPUSelectLoweringStrategy (iree-llvmcpu-select-lowering-strategy) //----- //
// Anonymous top-level module containing the single dispatch function below.
module {
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function carries translation_info = CPUDoubleTilingExpert, and the
// generic and pack ops each carry a lowering_config attribute.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
// lowering_config lists four tile-size levels: [64, 64, 64], [1, 1, 16] and two all-zero levels.
// NOTE(review): the meaning of each level is not visible in this dump — confirm against the IREE codegen docs.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
// Its lowering_config differs from the generic's in the middle entry of the
// first two levels ([64, 4, 64] and [1, 0, 16]).
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
// Executable variant for the "llvm-cpu" backend in "embedded-elf-x86_64"
// format. The target attribute names cpu = "znver4", an explicit feature list,
// a data layout string, native_vector_size = 64 and the target triple.
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
// Export of the dispatch function as ordinal 0. The pipeline layout declares
// 4 push constants and one set (0) with two storage-buffer bindings:
// binding 0 flagged ReadOnly, binding 1 with no flags.
hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
// Workgroup-count region: takes the device and one index workload value and
// returns the x/y/z counts produced by workgroup_count_from_slice.
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function carries translation_info = CPUDoubleTilingExpert, and the
// generic and pack ops each carry a lowering_config attribute.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
// lowering_config lists four tile-size levels: [64, 64, 64], [1, 1, 16] and two all-zero levels.
// NOTE(review): the meaning of each level is not visible in this dump — confirm against the IREE codegen docs.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
// Its lowering_config differs from the generic's in the middle entry of the
// first two levels ([64, 4, 64] and [1, 0, 16]).
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
// Executable holding a single variant, which in turn holds one export and the
// module with the dispatch function.
hal.executable public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1 {
// Variant for the "llvm-cpu" backend in "embedded-elf-x86_64" format. The
// target attribute names cpu = "znver4", an explicit feature list, a data
// layout string, native_vector_size = 64 and the target triple.
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,+avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,+avx512vpopcntdq,+cmov,-avx512vp2intersect,+avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,+avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,+gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,+avx512f,-amx-bf16,+avx512bf16,+avx512vnni,-push2pop2,+cx8,+avx512bw,+sse3,+pku,+fsgsbase,+clzero,+mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,+wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,+avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,+avx512vbmi2,-prefetchi,+rdpid,-fma4,+avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,+avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
// Export of the dispatch function as ordinal 0. The pipeline layout declares
// 4 push constants and one set (0) with two storage-buffer bindings:
// binding 0 flagged ReadOnly, binding 1 with no flags.
hal.executable.export public @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
// Workgroup-count region: takes the device and one index workload value and
// returns the x/y/z counts produced by workgroup_count_from_slice.
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function carries translation_info = CPUDoubleTilingExpert, and the
// generic and pack ops each carry a lowering_config attribute.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
// lowering_config lists four tile-size levels: [64, 64, 64], [1, 1, 16] and two all-zero levels.
// NOTE(review): the meaning of each level is not visible in this dump — confirm against the IREE codegen docs.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
// Its lowering_config differs from the generic's in the middle entry of the
// first two levels ([64, 4, 64] and [1, 0, 16]).
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialect (iree-codegen-lower-executable-using-transform-dialect) //----- //
// Anonymous top-level module containing the single dispatch function below.
module {
// Dispatch entry point: broadcasts a static 8640x3200 f16 tensor along a new
// dynamic leading dimension, packs the result into 16x1 inner tiles, and stores
// it. The function carries translation_info = CPUDoubleTilingExpert, and the
// generic and pack ops each carry a lowering_config attribute.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 interface constants; they are combined pairwise below into two
// 64-bit values (low word | high word << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32), cast to index. Used as the offset(...)
// operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32), cast to index. Feeds the workload
// ordinal op below.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input: set(0)/binding(0), flagged ReadOnly, static 8640x3200xf16 at offset %c0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
// %15 wraps %13 as workload ordinal 0; it supplies every dynamic (?) extent below.
%15 = flow.dispatch.workload.ordinal %13, 0 : index
// Output: set(0)/binding(1), write-only ?x540x3200x16x1xf16 with the dynamic dim bound to %15.
%16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
// Load the entire input tensor (zero offsets, full sizes, unit strides).
%17 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [8640, 3200], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<8640x3200xf16>
// Destination tensors: %18 receives the packed result, %19 the broadcast result.
%18 = tensor.empty(%15) : tensor<?x540x3200x16x1xf16>
%19 = tensor.empty(%15) : tensor<?x8640x3200xf16>
// Broadcast: the input indexing map omits d0, so each (d1, d2) element is
// copied to every position along d0 of the output.
// lowering_config lists four tile-size levels: [64, 64, 64], [1, 1, 16] and two all-zero levels.
// NOTE(review): the meaning of each level is not visible in this dump — confirm against the IREE codegen docs.
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%17 : tensor<8640x3200xf16>) outs(%19 : tensor<?x8640x3200xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x8640x3200xf16>
// Pack: dim 1 is tiled by 16 (8640 -> 540 outer x 16 inner) and dim 2 by 1
// (3200 outer x 1 inner); the outer dimension order is unchanged.
// Its lowering_config differs from the generic's in the middle entry of the
// first two levels ([64, 4, 64] and [1, 0, 16]).
%pack = tensor.pack %20 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %18 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x8640x3200xf16> -> tensor<?x540x3200x16x1xf16>
// Write the full packed tensor to the output binding.
flow.dispatch.tensor.store %pack, %16, offsets = [0, 0, 0, 0, 0], sizes = [%15, 540, 3200, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x540x3200x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%15}
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroups (iree-codegen-tile-and-distribute-to-workgroups) //----- //
// Snapshot of the broadcast+pack dispatch after workgroup tiling/distribution.
// Three nested scf.for loops take their lower bound and step from the
// workgroup id/count scaled by 64 (z), 4 (y) and 64 (x). Each innermost
// iteration loads a 64x64 f16 tile from binding 0, replicates it along a
// dynamic leading dimension, packs it with inner tiles [16, 1] and stores the
// packed tile into binding 1.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, distributed over workgroup z in
// tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, distributed over workgroup y in
// tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, distributed over workgroup x in tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%24 = flow.dispatch.tensor.load %14, offsets = [%23, %arg2], sizes = [%c64, %c64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<?x?xf16>
%25 = tensor.empty(%18) : tensor<?x64x64xf16>
%cast = tensor.cast %24 : tensor<?x?xf16> to tensor<64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast : tensor<64x64xf16>) outs(%25 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
// In this snapshot the pack destination is a fresh tensor.empty.
%27 = tensor.empty(%18) : tensor<?x4x64x16x1xf16>
%pack = tensor.pack %26 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %27 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
%cast_0 = tensor.cast %pack : tensor<?x4x64x16x1xf16> to tensor<?x?x?x16x1xf16>
// %32 is computed from %2/%3 with the same op sequence as %13; it only
// supplies the dynamic dimension of the store's target type.
%28 = arith.extui %2 : i32 to i64
%29 = arith.extui %3 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
flow.dispatch.tensor.store %cast_0, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, %c4, %c64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x?x?x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%32}
}
}
}
return
}
// -----// IR Dump After ConvertToDestinationPassingStyle (iree-codegen-convert-to-destination-passing-style) //----- //
// Snapshot of the broadcast+pack dispatch after conversion to
// destination-passing style. The tensor.pack destination (%cast) now comes
// from a flow.dispatch.tensor.load of the output binding %15 at the same
// offsets/sizes the final store writes to. The broadcast linalg.generic still
// writes into a tensor.empty.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// %27 and %32 are each computed from %2/%3 with the same op sequence as
// %13. %27 is the dynamic dim of the load's source type; %32 feeds the
// affine.min below.
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %2 : i32 to i64
%29 = arith.extui %3 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
// Same map and %arg0 operand as %18, with %32 in place of %13.
%33 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%32]
// Destination tile for the pack, read from the output binding.
%34 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%33, %c4, %c64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%27} -> tensor<?x?x?x16x1xf16>
%cast = tensor.cast %34 : tensor<?x?x?x16x1xf16> to tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%35 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%36 = flow.dispatch.tensor.load %14, offsets = [%35, %arg2], sizes = [%c64, %c64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<?x?xf16>
%37 = tensor.empty(%18) : tensor<?x64x64xf16>
%cast_0 = tensor.cast %36 : tensor<?x?xf16> to tensor<64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast_0 : tensor<64x64xf16>) outs(%37 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %38 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %cast {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
%cast_1 = tensor.cast %pack : tensor<?x4x64x16x1xf16> to tensor<?x?x?x16x1xf16>
// %43: a third computation of the same value as %13, used as the dynamic
// dim of the store's target type.
%39 = arith.extui %2 : i32 to i64
%40 = arith.extui %3 : i32 to i64
%41 = arith.shli %40, %c32_i64 : i64
%42 = arith.ori %39, %41 : i64
%43 = arith.index_castui %42 : i64 to index
flow.dispatch.tensor.store %cast_1, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, %c4, %c64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x?x?x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%43}
}
}
}
return
}
// -----// IR Dump After FoldAffineMinInDistributedLoops (iree-codegen-fold-affinemin-in-distributed-loops) //----- //
// Snapshot of the broadcast+pack dispatch after
// FoldAffineMinInDistributedLoops. Both affine.min ops (%18 and %33) are
// still present in this snapshot; they bound the tile extent along the dynamic
// leading dimension.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// %27 and %32 are each computed from %2/%3 with the same op sequence as
// %13. %27 is the dynamic dim of the load's source type; %32 feeds the
// affine.min below.
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = arith.extui %2 : i32 to i64
%29 = arith.extui %3 : i32 to i64
%30 = arith.shli %29, %c32_i64 : i64
%31 = arith.ori %28, %30 : i64
%32 = arith.index_castui %31 : i64 to index
// Same map and %arg0 operand as %18, with %32 in place of %13.
%33 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%32]
// Destination tile for the pack, read from the output binding.
%34 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%33, %c4, %c64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%27} -> tensor<?x?x?x16x1xf16>
%cast = tensor.cast %34 : tensor<?x?x?x16x1xf16> to tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%35 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%36 = flow.dispatch.tensor.load %14, offsets = [%35, %arg2], sizes = [%c64, %c64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<?x?xf16>
%37 = tensor.empty(%18) : tensor<?x64x64xf16>
%cast_0 = tensor.cast %36 : tensor<?x?xf16> to tensor<64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast_0 : tensor<64x64xf16>) outs(%37 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %38 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %cast {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
%cast_1 = tensor.cast %pack : tensor<?x4x64x16x1xf16> to tensor<?x?x?x16x1xf16>
// %43: a third computation of the same value as %13, used as the dynamic
// dim of the store's target type.
%39 = arith.extui %2 : i32 to i64
%40 = arith.extui %3 : i32 to i64
%41 = arith.shli %40, %c32_i64 : i64
%42 = arith.ori %39, %41 : i64
%43 = arith.index_castui %42 : i64 to index
flow.dispatch.tensor.store %cast_1, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, %c4, %c64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x?x?x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%43}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Snapshot of the broadcast+pack dispatch after canonicalization. The tile
// extents 4 and 64 appear as static size attributes on the loads/store, the
// loaded tiles carry static shapes (no tensor.cast ops), and the pack result
// is stored directly. One in-loop recomputation of the leading dimension
// (%23..%27) remains.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// %27 is computed from %2/%3 with the same op sequence as %13, and %28
// applies the same map as %18 to it.
%23 = arith.extui %2 : i32 to i64
%24 = arith.extui %3 : i32 to i64
%25 = arith.shli %24, %c32_i64 : i64
%26 = arith.ori %23, %25 : i64
%27 = arith.index_castui %26 : i64 to index
%28 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%27]
// Destination tile for the pack, read from the output binding.
%29 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%28, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%30 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%31 = flow.dispatch.tensor.load %14, offsets = [%30, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
%32 = tensor.empty(%18) : tensor<?x64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%33 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%31 : tensor<64x64xf16>) outs(%32 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %33 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %29 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
flow.dispatch.tensor.store %pack, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Snapshot of the broadcast+pack dispatch after CSE. The innermost loop body
// contains no arith ops: the destination load, the tensor.empty and the store
// all use the single tile extent %18, and every dynamic-dimension operand
// is %13.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile for the pack, read from the output binding.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
%26 = tensor.empty(%18) : tensor<?x64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<64x64xf16>) outs(%26 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %27 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %23 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
flow.dispatch.tensor.store %pack, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// Snapshot of the broadcast+pack dispatch after FuseTensorPadWithConsumer.
// This function contains no tensor.pad op. The body is a three-level
// workgroup-distributed loop nest around load -> broadcast (linalg.generic)
// -> tensor.pack -> store.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile for the pack, read from the output binding.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
%26 = tensor.empty(%18) : tensor<?x64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<64x64xf16>) outs(%26 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %27 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %23 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
flow.dispatch.tensor.store %pack, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After ConcretizePadResultShape (iree-codegen-concretize-pad-result-shape) //----- //
// Snapshot of the broadcast+pack dispatch after ConcretizePadResultShape.
// This function contains no tensor.pad op. The body is a three-level
// workgroup-distributed loop nest around load -> broadcast (linalg.generic)
// -> tensor.pack -> store.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile for the pack, read from the output binding.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
%26 = tensor.empty(%18) : tensor<?x64x64xf16>
// Broadcast: the input map ignores d0, so the 64x64 tile is yielded for
// every index of the leading dimension.
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%25 : tensor<64x64xf16>) outs(%26 : tensor<?x64x64xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<?x64x64xf16>
%pack = tensor.pack %27 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %23 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<?x64x64xf16> -> tensor<?x4x64x16x1xf16>
flow.dispatch.tensor.store %pack, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After LLVMCPUTileAndFuse (iree-llvmcpu-tile-and-fuse) //----- //
// Snapshot of the broadcast+pack dispatch after LLVMCPUTileAndFuse. Inside the
// workgroup-distributed loop nest there are now two additional scf.for loops
// with iter_args: %arg3 walks the leading dimension one index at a time and
// %arg5 walks the 64 tile columns in steps of 16. Each inner iteration
// broadcasts a 64x16 slab into a 1x64x16 tensor, packs it into a
// 1x4x16x16x1 slice of the carried destination tile and inserts it back.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = zext(%0) | (zext(%1) << 32): used below as the offset of binding 1.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = zext(%2) | (zext(%3) << 32): the dynamic leading dimension of the
// output tensor and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Outer loop: dynamic leading dimension, workgroup z, tiles of 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): extent of this (possibly partial) tile.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Middle loop: the 540 packed outer rows, workgroup y, tiles of 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Inner loop: the 3200 columns, workgroup x, tiles of 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile, read from the output binding; it seeds the iter_args
// of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// The pack uses an inner tile of 16 on the row dimension, so packed row
// index %arg1 corresponds to source row %arg1 * 16.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Loop over the %18 indices of the leading dimension, one per iteration;
// %arg4 carries the destination tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Loop over the 64 tile columns in slabs of 16; %arg6 carries the
// destination tile as it is filled in.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// 64x16 column slab of the input tile, starting at column %arg5.
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast of the slab into a tensor with a unit leading dimension.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Destination slice for this (leading index, column slab) pair.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
// Write the packed slab back at the same offsets it was extracted from.
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after FuseTensorPadWithConsumer.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After ConcretizePadResultShape (iree-codegen-concretize-pad-result-shape) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after ConcretizePadResultShape.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After LLVMCPUSplitReduction (iree-llvmcpu-split-reduction) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after LLVMCPUSplitReduction.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After LLVMCPUTile (iree-llvmcpu-tile) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after LLVMCPUTile.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After LLVMCPUTileAndFuse (iree-llvmcpu-tile-and-fuse) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after LLVMCPUTileAndFuse.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After FuseTensorPadWithConsumer (iree-codegen-fuse-tensor-pad-with-consumer) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after FuseTensorPadWithConsumer.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After ConcretizePadResultShape (iree-codegen-concretize-pad-result-shape) //----- //
// Dispatch function: broadcasts the 8640x3200 f16 input along a dynamic leading dimension and
// packs it into a ?x540x3200x16x1 f16 output. Snapshot printed after ConcretizePadResultShape.
// The body matches the function printed in the preceding dump, so this pass appears to have left it unchanged.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four 32-bit push constants; they are combined pairwise below into two 64-bit values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = constant[0] | (constant[1] << 32), cast to index; used as the offset of the output binding %15.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = constant[2] | (constant[3] << 32), cast to index; supplies the dynamic leading dimension of the output tensor.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 f16 source. Binding 1: write-only ?x540x3200x16x1 f16 destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Output dim 0 (dynamic): start at workgroup_id_z * 64 and advance by workgroup_count_z * 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// %18 = min(%13 - %arg0, 64): tile extent along dim 0, clamped at the end of the dynamic dimension.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// Output dim 1 (540): start at workgroup_id_y * 4 and advance by workgroup_count_y * 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// Output dim 2 (3200): start at workgroup_id_x * 64 and advance by workgroup_count_x * 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Load the current destination tile; it seeds the loop-carried tensor of the loops below.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// %24 = %arg1 * 16: row offset into the unpacked source for outer-tile index %arg1 (the inner tile size is 16).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Iterate over each index of the dim-0 tile (%arg3) and over the 64 columns in chunks of 16 (%arg5).
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// The source slice depends only on %arg5, so the same data is produced for every %arg3 (the broadcast).
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Copy the 64x16 slice into a 1x64x16 tensor; the input indexing map (d1, d2) ignores d0.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Pack dims 1 and 2 with inner tiles [16, 1] into the matching 1x4x16x16x1 destination slice, then insert it back.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the finished tile back to the output binding at the same offsets it was loaded from.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After TensorToVectorVectorizePad (iree-codegen-vectorize-tensor-pad) //----- //
// Workgroup-distributed, tiled form of the broadcast + pack dispatch.
// Reads the 8640x3200xf16 input (binding 0), replicates it along a dynamic
// leading dimension and writes it to binding 1 in ?x540x3200x16x1 layout.
// In this snapshot the per-tile body is still a linalg.generic (broadcast)
// followed by a tensor.pack.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants. Each pair is zero-extended and combined into one
// 64-bit value as lo | (hi << 32), then cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 (constants 0 and 1) is used below as the offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 (constants 2 and 3) is the size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
// Workgroup ids and counts; each distributed loop below starts at id * tile
// and steps by count * tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z: dynamic leading dimension, tile size 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// Extent of this tile along the dynamic dimension: min(%13 - %arg0, 64),
// so the last tile may be smaller than 64.
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y: the 540-sized output dimension, tile size 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// x: the 3200-sized dimension, tile size 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile used as the initial loop-carried value. The two loops
// below write every [1, 4, 16, 16, 1] sub-slice of it before the store.
// NOTE(review): this loads from a binding typed writeonly; presumably the
// loaded contents are never observed because the tile is fully overwritten
// -- confirm this is the intended IREE convention.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// Source row offset: each index of the 540-sized dimension covers 16 source
// rows (inner tile 16), so 4 indices map to the 64 rows loaded here.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 columns of the source tile in chunks of 16.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast 64x16 -> 1x64x16: the input map drops d0 and the body yields the
// input element unchanged.
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Destination slice for the pack, taken from the loop-carried tile.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
// Pack 1x64x16 into 1x4x16x16x1: dim 1 is tiled by 16 and dim 2 by 1.
%pack = tensor.pack %29 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %extracted_slice_0 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4, 64], [1, 0, 16], [0, 0, 0], [0, 0, 0]]>} : tensor<1x64x16xf16> -> tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %pack into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After DecomposePackUnPackOps (iree-codegen-decompose-pack-unpack-ops) //----- //
// Broadcast + pack dispatch with the pack decomposed: the per-tile body is a
// linalg.generic (broadcast) followed by tensor.expand_shape and
// linalg.transpose instead of a tensor.pack. Input is the 8640x3200xf16
// binding 0; output is the ?x540x3200x16x1xf16 binding 1.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
// Each distributed loop below starts at id * tile and steps by count * tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z: dynamic leading dimension, tile size 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y: the 540-sized output dimension, tile size 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// x: the 3200-sized dimension, tile size 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast 64x16 -> 1x64x16 (input map drops d0; body yields the input).
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
// Split 64 rows into 4x16 and 16 columns into 16x1 ...
%expanded = tensor.expand_shape %29 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
// ... then swap dims 2 and 3 so the 16-row inner tile ends up after the
// column dimension.
%transposed = linalg.transpose ins(%expanded : tensor<1x4x16x16x1xf16>) outs(%extracted_slice_0 : tensor<1x4x16x16x1xf16>) permutation = [0, 1, 3, 2, 4]
%inserted_slice = tensor.insert_slice %transposed into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Broadcast + pack dispatch, tiled and distributed over workgroups. The
// per-tile body broadcasts a 64x16 slice (linalg.generic), reshapes it with
// tensor.expand_shape and reorders it with linalg.transpose into the packed
// 1x4x16x16x1 layout.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
// Each distributed loop below starts at id * tile and steps by count * tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z: dynamic leading dimension, tile size 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y: the 540-sized output dimension, tile size 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// x: the 3200-sized dimension, tile size 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast 64x16 -> 1x64x16 (input map drops d0; body yields the input).
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Destination slice that the transpose writes into.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
// Reshape: 64 rows -> 4x16, 16 columns -> 16x1.
%expanded = tensor.expand_shape %29 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
// Swap dims 2 and 3 to reach the packed element order.
%transposed = linalg.transpose ins(%expanded : tensor<1x4x16x16x1xf16>) outs(%extracted_slice_0 : tensor<1x4x16x16x1xf16>) permutation = [0, 1, 3, 2, 4]
%inserted_slice = tensor.insert_slice %transposed into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Broadcast + pack dispatch in tensor form, immediately before any vector
// ops appear: three workgroup-distributed tile loops around two sequential
// loops whose body is linalg.generic (broadcast) + tensor.expand_shape +
// linalg.transpose.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
// Each distributed loop below starts at id * tile and steps by count * tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z: dynamic leading dimension, tile size 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y: the 540-sized output dimension, tile size 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// x: the 3200-sized dimension, tile size 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast 64x16 -> 1x64x16 (input map drops d0; body yields the input).
%29 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<64x16xf16>) outs(%28 : tensor<1x64x16xf16>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64], [1, 1, 16], [0, 0, 0], [0, 0, 0]]>} {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
} -> tensor<1x64x16xf16>
// Destination slice that the transpose writes into.
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
// Reshape: 64 rows -> 4x16, 16 columns -> 16x1.
%expanded = tensor.expand_shape %29 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
// Swap dims 2 and 3 to reach the packed element order.
%transposed = linalg.transpose ins(%expanded : tensor<1x4x16x16x1xf16>) outs(%extracted_slice_0 : tensor<1x4x16x16x1xf16>) permutation = [0, 1, 3, 2, 4]
%inserted_slice = tensor.insert_slice %transposed into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After GenericVectorization (iree-codegen-generic-vectorization) //----- //
// Vectorized form of the broadcast + pack dispatch. The per-tile body now
// uses vector.transfer_read / vector.broadcast / vector.transpose /
// vector.transfer_write; the surrounding tensor.extract_slice,
// tensor.expand_shape and tensor.insert_slice ops are still present.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
// Padding operand for the vector.transfer_read ops below; every read is
// marked in_bounds.
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
// Each distributed loop below starts at id * tile and steps by count * tile.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// z: dynamic leading dimension, tile size 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%18 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y: the 540-sized output dimension, tile size 4.
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg1 = %19 to %c540 step %20 {
// x: the 3200-sized dimension, tile size 64.
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%22 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg2 = %21 to %c3200 step %22 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%23 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
%25 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%26 = scf.for %arg3 = %c0 to %18 step %c1 iter_args(%arg4 = %23) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%27 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
%extracted_slice = tensor.extract_slice %25[0, %arg5] [64, 16] [1, 1] : tensor<64x64xf16> to tensor<64x16xf16>
%28 = tensor.empty() : tensor<1x64x16xf16>
// Broadcast step: read the 64x16 slice as a vector, add a unit leading
// dimension, and write it into the 1x64x16 scratch tensor.
%29 = vector.transfer_read %extracted_slice[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<64x16xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
%31 = vector.transfer_write %30, %28[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
%extracted_slice_0 = tensor.extract_slice %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> to tensor<1x4x16x16x1xf16>
// Reshape on the tensor: 64 rows -> 4x16, 16 columns -> 16x1.
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
// Transpose step: read the reshaped tensor back as a vector, swap dims 2 and
// 3, and write the result into the destination slice.
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
%34 = vector.transfer_write %33, %extracted_slice_0[%c0, %c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<1x4x16x16x1xf16>
%inserted_slice = tensor.insert_slice %34 into %arg6[%arg3, 0, %arg5, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : tensor<1x4x16x16x1xf16> into tensor<?x4x64x16x1xf16>
scf.yield %inserted_slice : tensor<?x4x64x16x1xf16>
}
scf.yield %27 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %26, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%18, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After OptimizeTensorInsertExtractSlices (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
// Vectorized broadcast + pack dispatch with slice ops folded away. The
// innermost loop has no tensor.extract_slice / tensor.insert_slice: the
// vector transfers index the 64x64 source tile and the loop-carried
// destination tile directly. Loop-bound computations and the scratch
// tensor.empty sit above the loops that use them.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
// Padding operand for the vector.transfer_read ops below; every read is
// marked in_bounds.
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Start (id * tile) and step (count * tile) for the three distributed loops:
// z uses tile 64, y uses tile 4, x uses tile 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
// 1x64x16 scratch tensor, created once and used as the destination of the
// broadcast write in the innermost loop.
%22 = tensor.empty() : tensor<1x64x16xf16>
// z loop over the dynamic leading dimension.
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%23 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y loop over the 540-sized output dimension.
scf.for %arg1 = %18 to %c540 step %19 {
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
// x loop over the 3200-sized dimension.
scf.for %arg2 = %20 to %c3200 step %21 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%25 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
%26 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%27 = scf.for %arg3 = %c0 to %23 step %c1 iter_args(%arg4 = %25) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%28 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// Read a 64x16 vector straight from the source tile at column %arg5.
%29 = vector.transfer_read %26[%c0, %arg5], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
// NOTE(review): the vector is written to the scratch tensor, reshaped, and
// read back as a vector; presumably a later pass removes this round trip --
// not visible in this function.
%31 = vector.transfer_write %30, %22[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 to reach the packed element order.
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Write directly into the loop-carried tile at [%arg3, 0, %arg5, 0, 0].
%34 = vector.transfer_write %33, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<?x4x64x16x1xf16>
scf.yield %34 : tensor<?x4x64x16x1xf16>
}
scf.yield %28 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %27, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Vectorized broadcast + pack dispatch. Three workgroup-distributed tile
// loops surround two sequential loops; the innermost body reads a 64x16
// vector from the source tile, broadcasts and reshapes it, transposes it,
// and writes it into the loop-carried ?x4x64x16x1 destination tile.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
// Padding operand for the vector.transfer_read ops below; every read is
// marked in_bounds.
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants, combined pairwise into 64-bit values as
// lo | (hi << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Binding 0: read-only 8640x3200 source. Binding 1: packed destination.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Start (id * tile) and step (count * tile) for the three distributed loops:
// z uses tile 64, y uses tile 4, x uses tile 64.
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
// 1x64x16 scratch tensor used as the destination of the broadcast write in
// the innermost loop.
%22 = tensor.empty() : tensor<1x64x16xf16>
// z loop over the dynamic leading dimension.
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along the dynamic dimension: min(%13 - %arg0, 64).
%23 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
// y loop over the 540-sized output dimension.
scf.for %arg1 = %18 to %c540 step %19 {
// Source row offset: 16 source rows per index of the 540-sized dimension.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
// x loop over the 3200-sized dimension.
scf.for %arg2 = %20 to %c3200 step %21 {
// Destination tile used as the initial loop-carried value; the loops below
// overwrite all of it in [1, 4, 16, 16, 1] pieces.
// NOTE(review): loaded from a writeonly-typed binding -- confirm intended.
%25 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
%26 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// One iteration per index of the leading (broadcast) dimension in this tile.
%27 = scf.for %arg3 = %c0 to %23 step %c1 iter_args(%arg4 = %25) -> (tensor<?x4x64x16x1xf16>) {
// Walk the 64 source columns in chunks of 16.
%28 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// Read a 64x16 vector from the source tile at column %arg5 and give it a
// unit leading dimension.
%29 = vector.transfer_read %26[%c0, %arg5], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
// NOTE(review): write to scratch tensor, reshape, then read back as a
// vector; presumably simplified by a later pass -- not visible here.
%31 = vector.transfer_write %30, %22[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 to reach the packed element order.
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Write directly into the loop-carried tile at [%arg3, 0, %arg5, 0, 0].
%34 = vector.transfer_write %33, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<?x4x64x16x1xf16>
scf.yield %34 : tensor<?x4x64x16x1xf16>
}
scf.yield %28 : tensor<?x4x64x16x1xf16>
}
// Write the completed tile back to the output binding.
flow.dispatch.tensor.store %27, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Tensor-form dispatch body. Each workgroup walks its share of tiles over the
// dynamic leading dim (%13), the 540-sized dim and the 3200-sized dim of the
// ?x540x3200x16x1 f16 output, and fills every tile from a 64x64 slice of the
// 8640x3200 f16 input.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Push constants 0..3 are 32-bit halves of two 64-bit values; each pair is
// recombined as (low | (high << 32)) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 (constants 0 and 1) feeds offset(...) of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 (constants 2 and 3) is the dynamic leading dimension of the output.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// binding(0): read-only input. binding(1): write-only output whose first dim is %13.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Per-axis loop start (id * tile) and step (count * tile):
// z -> output dim 0 (tile 64), y -> dim 1 (tile 4), x -> dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
// 1x64x16 staging tensor that the innermost loop writes into and then reshapes.
%22 = tensor.empty() : tensor<1x64x16xf16>
scf.for %arg0 = %16 to %13 step %17 {
// Clamp the dim-0 tile to min(%13 - %arg0, 64); the last tile may be partial.
%23 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Input row offset is %arg1 * 16 (the 4-wide dim-1 tile maps to 64 input rows).
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// %25: output tile, loaded from the writeonly binding to seed the loop-carried value.
// %26: the matching 64x64 input tile.
%25 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
%26 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// %arg3 visits one dim-0 slice per iteration. The input read below does not
// use %arg3, so every slice of the tile receives the same data.
%27 = scf.for %arg3 = %c0 to %23 step %c1 iter_args(%arg4 = %25) -> (tensor<?x4x64x16x1xf16>) {
// %arg5 walks the 64 input columns in chunks of 16.
%28 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// Read a 64x16 column chunk, add a unit leading dim, and stage it in %22.
%29 = vector.transfer_read %26[%c0, %arg5], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
%31 = vector.transfer_write %30, %22[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
// Split 64 -> 4x16 and 16 -> 16x1, then swap the two 16-sized dims (2 and 3).
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Insert the transposed chunk at [%arg3, 0, %arg5, 0, 0] of the carried tile.
%34 = vector.transfer_write %33, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<?x4x64x16x1xf16>
scf.yield %34 : tensor<?x4x64x16x1xf16>
}
scf.yield %28 : tensor<?x4x64x16x1xf16>
}
// Store the finished tile back at the same offsets/sizes it was loaded from.
flow.dispatch.tensor.store %27, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After EliminateEmptyTensors (iree-eliminate-empty-tensors) //----- //
// Tensor-form dispatch body: tiles the ?x540x3200x16x1 f16 output across
// workgroups and fills each tile from a 64x64 slice of the 8640x3200 f16 input.
// The 1x64x16 staging value (%22) is still a tensor.empty at this point.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Four i32 push constants; pairs (0,1) and (2,3) are each merged into one
// 64-bit value as low | (high << 32), then cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: value passed as offset(...) of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: size of the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// %14 is the read-only 8640x3200 input; %15 is the write-only output sized by %13.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop lower bounds are id * tile and steps are count * tile, with tile sizes
// 64 (z, output dim 0), 4 (y, output dim 1) and 64 (x, output dim 2).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
// Staging tensor for the 64x16 chunk handled by each innermost iteration.
%22 = tensor.empty() : tensor<1x64x16xf16>
scf.for %arg0 = %16 to %13 step %17 {
// Dim-0 tile extent: min(%13 - %arg0, 64), so a trailing partial tile is handled.
%23 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Each unit of output dim 1 corresponds to 16 input rows.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile (initial value for the loops below) and the 64x64 input tile.
%25 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
%26 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Outer loop: one dim-0 slice per step. The input indices never involve
// %arg3, so identical data is written into every slice.
%27 = scf.for %arg3 = %c0 to %23 step %c1 iter_args(%arg4 = %25) -> (tensor<?x4x64x16x1xf16>) {
// Inner loop: 16 input columns per step.
%28 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// Load 64 rows x 16 columns, prepend a unit dim, stage through %22.
%29 = vector.transfer_read %26[%c0, %arg5], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
%31 = vector.transfer_write %30, %22[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
// Reshape 1x64x16 -> 1x4x16x16x1, then transpose dims 2 and 3.
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Write the chunk into slice %arg3 at dim-2 offset %arg5.
%34 = vector.transfer_write %33, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<?x4x64x16x1xf16>
scf.yield %34 : tensor<?x4x64x16x1xf16>
}
scf.yield %28 : tensor<?x4x64x16x1xf16>
}
// Commit the tile to the output binding at the offsets it was loaded from.
flow.dispatch.tensor.store %27, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
// Tensor-form dispatch body: tiles the ?x540x3200x16x1 f16 output across
// workgroups and fills each tile from a 64x64 slice of the 8640x3200 f16 input.
// The 1x64x16 staging value (%22) is a bufferization.alloc_tensor here.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant 0.000000e+00 : f16
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c3200 = arith.constant 3200 : index
%c540 = arith.constant 540 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
// Push constants 0..3: two 64-bit values passed as 32-bit halves and rebuilt
// below as low | (high << 32).
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 is used as offset(...) of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 is the dynamic leading dimension of binding(1).
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Input (read-only, static shape) and output (write-only, leading dim %13).
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Start = workgroup id * tile, step = workgroup count * tile, for tiles of
// 64 (z / dim 0), 4 (y / dim 1) and 64 (x / dim 2).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
// Explicitly allocated 1x64x16 staging tensor, created once outside all loops.
%22 = bufferization.alloc_tensor() : tensor<1x64x16xf16>
scf.for %arg0 = %16 to %13 step %17 {
// Tile extent along dim 0, clamped to what remains: min(%13 - %arg0, 64).
%23 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Row offset into the input: 16 rows per unit of output dim 1.
%24 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// %25 seeds the loop-carried output tile; %26 is the 64x64 input tile.
%25 = flow.dispatch.tensor.load %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13} -> tensor<?x4x64x16x1xf16>
%26 = flow.dispatch.tensor.load %14, offsets = [%24, %arg2], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8640x3200xf16>> -> tensor<64x64xf16>
// Per dim-0 slice. Reads from %26 are independent of %arg3, so each slice
// gets the same contents.
%27 = scf.for %arg3 = %c0 to %23 step %c1 iter_args(%arg4 = %25) -> (tensor<?x4x64x16x1xf16>) {
// Per 16-column chunk of the input tile.
%28 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (tensor<?x4x64x16x1xf16>) {
// 64x16 chunk -> 1x64x16 vector -> written into the staging tensor.
%29 = vector.transfer_read %26[%c0, %arg5], %cst {in_bounds = [true, true]} : tensor<64x64xf16>, vector<64x16xf16>
%30 = vector.broadcast %29 : vector<64x16xf16> to vector<1x64x16xf16>
%31 = vector.transfer_write %30, %22[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, tensor<1x64x16xf16>
// Expand to 1x4x16x16x1, reload as a vector and swap dims 2 and 3.
%expanded = tensor.expand_shape %31 [[0], [1, 2], [3, 4]] : tensor<1x64x16xf16> into tensor<1x4x16x16x1xf16>
%32 = vector.transfer_read %expanded[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : tensor<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%33 = vector.transpose %32, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Place the result at [%arg3, 0, %arg5, 0, 0] in the carried output tile.
%34 = vector.transfer_write %33, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, tensor<?x4x64x16x1xf16>
scf.yield %34 : tensor<?x4x64x16x1xf16>
}
scf.yield %28 : tensor<?x4x64x16x1xf16>
}
// Store the completed tile to the output binding.
flow.dispatch.tensor.store %27, %15, offsets = [%arg0, %arg1, %arg2, 0, 0], sizes = [%23, 4, 64, 16, 1], strides = [1, 1, 1, 1, 1] : tensor<?x4x64x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x540x3200x16x1xf16>>{%13}
}
}
}
return
}
// -----// IR Dump After IREEComprehensiveBufferize (iree-codegen-iree-comprehensive-bufferize) //----- //
// Memref-form dispatch body. Bindings are memrefs, tiles are memref.subview
// results, and the 1x64x16 staging buffer is a memref.alloca. The inner loops
// still thread the output tile through iter_args, and a linalg.generic copy
// follows them.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Staging buffer for one 64x16 chunk, allocated once for the whole function.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Push constants 0..3 are 32-bit halves of two 64-bit values, rebuilt below
// as low | (high << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset(...) operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: dynamic leading dimension of binding(1).
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output strides are the dense row-major strides of ?x540x3200x16x1
// (540*3200*16 = 27648000, 3200*16 = 51200); the base offset is dynamic.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): assumed alignment is 1 here although the subspan declares
// alignment(64); presumably a consequence of the dynamic offset(%8) -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop starts (id * tile) and steps (count * tile): z -> dim 0 (tile 64),
// y -> dim 1 (tile 4), x -> dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Dim-0 tile extent, clamped to the remaining size: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Input row offset: 16 rows per unit of output dim 1.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// %subview: output tile view. %subview_0: 64x64 input tile view.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Both loops carry the output-tile memref as an iter_arg and yield it back
// unchanged; the actual update is the in-place transfer_write below.
// The input read does not use %arg3, so every dim-0 slice gets the same data.
%24 = scf.for %arg3 = %c0 to %22 step %c1 iter_args(%arg4 = %subview) -> (memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%25 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
// Read a 64x16 column chunk, add a unit leading dim, stage it in %alloca.
%26 = vector.transfer_read %subview_0[%c0, %arg5], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%27 = vector.broadcast %26 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %27, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// View the staging buffer as 1x4x16x16x1, reload it, swap dims 2 and 3.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%28 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%29 = vector.transpose %28, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Write the chunk into the output tile at [%arg3, 0, %arg5, 0, 0].
vector.transfer_write %29, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg6 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
scf.yield %25 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
// %subview_1 is built from the same operands as %subview. The generic below
// copies the loop result %24 (initialised from %subview) into it element-wise.
%subview_1 = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
}
}
}
return
}
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
// Memref-form dispatch body: per-workgroup tiles of the ?x540x3200x16x1 f16
// output are filled in place from 64x64 views of the 8640x3200 f16 input,
// staging each 64x16 chunk through a 1x64x16 alloca. The inner loops carry the
// output view as an iter_arg and are followed by a linalg.generic copy.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Function-level scratch buffer reused by every innermost iteration.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 push constants; pairs (0,1) and (2,3) are merged into 64-bit
// values as low | (high << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 is passed as offset(...) of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 is the output's dynamic leading dimension.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// The strided layout matches a dense row-major ?x540x3200x16x1 buffer
// (27648000 = 540*3200*16, 51200 = 3200*16) with a dynamic base offset.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only 1-alignment is assumed for %15 despite alignment(64) on
// the subspan; likely tied to the dynamic offset(%8) -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Start = id * tile and step = count * tile for tiles of 64 (z / dim 0),
// 4 (y / dim 1) and 64 (x / dim 2).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this dim-0 tile: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Row offset into the input: %arg1 * 16.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Views of the output tile (%subview) and of the 64x64 input tile (%subview_0).
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// The iter_args only pass the same memref along (each yield returns its own
// block argument or the inner loop result); data is updated by side effect.
// %arg3 selects the dim-0 slice; the input read is independent of it.
%24 = scf.for %arg3 = %c0 to %22 step %c1 iter_args(%arg4 = %subview) -> (memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
%25 = scf.for %arg5 = %c0 to %c64 step %c16 iter_args(%arg6 = %arg4) -> (memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
// 64x16 chunk of the input -> 1x64x16 vector -> staging buffer.
%26 = vector.transfer_read %subview_0[%c0, %arg5], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%27 = vector.broadcast %26 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %27, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// Reinterpret as 1x4x16x16x1, reload, and transpose dims 2 and 3.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%28 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%29 = vector.transpose %28, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// In-place store into the output tile at [%arg3, 0, %arg5, 0, 0].
vector.transfer_write %29, %arg6[%arg3, %c0, %arg5, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.yield %arg6 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
scf.yield %25 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
// Second view with the same offsets/sizes/strides as %subview; the generic
// performs an element-wise copy from the loop result %24 into it.
%subview_1 = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%24 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Memref-form dispatch body with result-less inner loops: the 64x16 chunks are
// written straight into the output tile view %subview. A linalg.generic copy
// from %subview to a second, identically-built view %subview_1 follows.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// 1x64x16 scratch buffer shared by all innermost iterations.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Push constants 0..3 hold two 64-bit values as 32-bit halves; each pair is
// rebuilt as low | (high << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8: offset(...) operand of binding(1).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13: dynamic leading dimension of binding(1).
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Dense row-major strides for ?x540x3200x16x1 (540*3200*16 = 27648000,
// 3200*16 = 51200), dynamic base offset.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): alignment assumed for %15 is 1, not the 64 declared on the
// subspan; presumably due to the dynamic offset(%8) -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop starts (id * tile) and steps (count * tile): z -> dim 0 (tile 64),
// y -> dim 1 (tile 4), x -> dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Dim-0 tile extent: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// Input row offset: %arg1 * 16.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile view and 64x64 input tile view.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3: dim-0 slice index. The input read below ignores it, so every slice
// of the tile is filled with the same data.
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4: start column of the current 16-column chunk.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read the 64x16 chunk, add a unit leading dim, stage it in %alloca.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%25 = vector.broadcast %24 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %25, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// View staging as 1x4x16x16x1, reload, swap dims 2 and 3.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%26 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%27 = vector.transpose %26, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Store directly into the output tile at [%arg3, 0, %arg4, 0, 0].
vector.transfer_write %27, %subview[%arg3, %c0, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
// NOTE(review): %subview_1 uses exactly the operands of %subview, so the
// generic below copies a region onto itself; the copy looks redundant.
%subview_1 = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
}
}
}
return
}
// -----// IR Dump After CSE (cse) //----- //
// Memref-form dispatch body with a single output tile view per tile: the inner
// loops write 64x16 chunks straight into %subview, and the trailing
// linalg.generic names %subview as both its input and its output.
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer for one 64x16 chunk, reused across all iterations.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 push constants, merged pairwise into 64-bit values as
// low | (high << 32) and cast to index.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 is the offset(...) operand of the output binding.
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 is the dynamic leading dimension of the output binding.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Strides correspond to a dense row-major ?x540x3200x16x1 layout
// (27648000 = 540*3200*16, 51200 = 3200*16); the base offset is dynamic.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): %15 is assumed 1-aligned although the subspan says
// alignment(64); presumably because offset(%8) is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Start = id * tile, step = count * tile; tiles are 64 (z / dim 0),
// 4 (y / dim 1) and 64 (x / dim 2).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Clamp the dim-0 tile to min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// 16 input rows per unit of output dim 1.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// %subview: output tile view. %subview_0: 64x64 input tile view.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// One iteration per dim-0 slice; the input read does not depend on %arg3,
// so all slices receive identical contents.
scf.for %arg3 = %c0 to %22 step %c1 {
// One iteration per 16-column chunk of the input tile.
scf.for %arg4 = %c0 to %c64 step %c16 {
// 64x16 chunk -> 1x64x16 vector -> staging buffer.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%25 = vector.broadcast %24 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %25, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// Reinterpret staging as 1x4x16x16x1, reload, transpose dims 2 and 3.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%26 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
%27 = vector.transpose %26, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Write into the output tile at [%arg3, 0, %arg4, 0, 0].
vector.transfer_write %27, %subview[%arg3, %c0, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
// NOTE(review): ins and outs are the same SSA value (%subview) with identity
// indexing maps, i.e. an element-wise self-copy; it appears to be a no-op.
linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%subview : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch body as dumped after canonicalization. Relative to the tail of the
// preceding dump, the linalg.generic that copied %subview onto itself after the
// two inner loops is no longer present.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile and stage them in %alloca.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%25 = vector.broadcast %24 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %25, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// View the staged tile as 1x4x16x16x1: the 64 rows split into 4x16, the 16
// columns into 16x1.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%26 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%27 = vector.transpose %26, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Store into slice %arg3 of the output tile, at column offset %arg4 along dim 2.
vector.transfer_write %27, %subview[%arg3, %c0, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After CleanupBufferAllocView (iree-codegen-cleanup-buffer-alloc-view) //----- //
// Dispatch body as dumped after iree-codegen-cleanup-buffer-alloc-view. The IR
// appears unchanged from the preceding dump.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile and stage them in %alloca.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%25 = vector.broadcast %24 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %25, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// View the staged tile as 1x4x16x16x1: the 64 rows split into 4x16, the 16
// columns into 16x1.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%26 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%27 = vector.transpose %26, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Store into slice %arg3 of the output tile, at column offset %arg4 along dim 2.
vector.transfer_write %27, %subview[%arg3, %c0, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After RemoveSingleIterationLoop (iree-codegen-remove-single-iteration-loop) //----- //
// Dispatch body as dumped after iree-codegen-remove-single-iteration-loop. The
// IR appears unchanged from the preceding dump; all five scf.for loops remain.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile and stage them in %alloca.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
%25 = vector.broadcast %24 : vector<64x16xf16> to vector<1x64x16xf16>
vector.transfer_write %25, %alloca[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x64x16xf16>, memref<1x64x16xf16>
// View the staged tile as 1x4x16x16x1: the 64 rows split into 4x16, the 16
// columns into 16x1.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%26 = vector.transfer_read %expand_shape[%c0, %c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true, true]} : memref<1x4x16x16x1xf16>, vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%27 = vector.transpose %26, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Store into slice %arg3 of the output tile, at column offset %arg4 along dim 2.
vector.transfer_write %27, %subview[%arg3, %c0, %arg4, %c0, %c0] {in_bounds = [true, true, true, true, true]} : vector<1x4x16x16x1xf16>, memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUDropVectorUnitDims (iree-llvmcpu-drop-vector-unit-dims) //----- //
// Dispatch body as dumped after iree-llvmcpu-drop-vector-unit-dims. Relative to
// the preceding dump, the vector transfers in the innermost loop now go through
// rank-reduced memref.subview ops that drop the unit dimensions, with
// vector.shape_cast / broadcast / extract re-adding and removing them around
// the transpose.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
// Stage the tile in %alloca through a 64x16 view that drops the leading unit dim.
%subview_1 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.transfer_write %24, %subview_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x16xf16>, memref<64x16xf16>
// View the staged tile as 1x4x16x16x1, then drop the trailing (%subview_2) and
// leading (%subview_3) unit dims to get a 4x16x16 view.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_2 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_3 = memref.subview %subview_2[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%25 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<4x16x16xf16>, vector<4x16x16xf16>
// Re-add the trailing and leading unit dims so the 5-D transpose still applies.
%26 = vector.shape_cast %25 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%27 = vector.broadcast %26 : vector<4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%28 = vector.transpose %27, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Drop the leading unit dim again.
%29 = vector.extract %28[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
// Rank-4 view of the output tile without its trailing unit dim.
%subview_4 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Drop the trailing unit dim of the vector, then store into slice %arg3 at
// column offset %arg4 along dim 2.
%30 = vector.shape_cast %29 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
vector.transfer_write %30, %subview_4[%arg3, %c0, %arg4, %c0] {in_bounds = [true, true, true]} : vector<4x16x16xf16>, memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUVirtualVectorLowering (iree-llvmcpu-virtual-vector-lowering) //----- //
// Dispatch body as dumped after iree-llvmcpu-virtual-vector-lowering. The IR
// appears unchanged from the preceding dump; the vector.transpose,
// vector.broadcast and vector.shape_cast ops are all still present.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
// Stage the tile in %alloca through a 64x16 view that drops the leading unit dim.
%subview_1 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.transfer_write %24, %subview_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x16xf16>, memref<64x16xf16>
// View the staged tile as 1x4x16x16x1, then drop the trailing (%subview_2) and
// leading (%subview_3) unit dims to get a 4x16x16 view.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_2 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_3 = memref.subview %subview_2[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%25 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<4x16x16xf16>, vector<4x16x16xf16>
// Re-add the trailing and leading unit dims so the 5-D transpose still applies.
%26 = vector.shape_cast %25 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%27 = vector.broadcast %26 : vector<4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%28 = vector.transpose %27, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Drop the leading unit dim again.
%29 = vector.extract %28[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
// Rank-4 view of the output tile without its trailing unit dim.
%subview_4 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Drop the trailing unit dim of the vector, then store into slice %arg3 at
// column offset %arg4 along dim 2.
%30 = vector.shape_cast %29 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
vector.transfer_write %30, %subview_4[%arg3, %c0, %arg4, %c0] {in_bounds = [true, true, true]} : vector<4x16x16xf16>, memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
// Dispatch body as dumped after the canonicalizer run that follows
// iree-llvmcpu-virtual-vector-lowering. The IR appears unchanged from the
// preceding dump.
// Net effect, as derived from the indexing below:
//   out[b, p, k, i, 0] = in[p * 16 + i, k]   for every b in [0, %13)
// where `in` is binding 0 (8640x3200 f16) and `out` is binding 1
// (?x540x3200x16x1 f16).
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f16
// Scratch buffer (memref.alloca) used to stage one 64x16 tile between the read
// from the input and the transposed write to the output.
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
// Four i32 interface constants, combined pairwise below into two index values.
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
// %8 = %0 | (%1 << 32), used as the offset of the output binding (%15).
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
// %13 = %2 | (%3 << 32), the dynamic extent of the output's leading dimension
// and the upper bound of the outermost loop.
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
// Read-only input: binding 0 at offset 0.
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
// Output: binding 1 at dynamic offset %8 with dynamic leading extent %13.
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
// NOTE(review): only an alignment of 1 is assumed for the output view,
// presumably because its offset %8 is dynamic -- confirm.
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
// Loop start (id * tile) and step (count * tile) per workgroup dimension:
// z -> output dim 0 (tile 64), y -> output dim 1 (tile 4), x -> output dim 2 (tile 64).
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
// Size of this tile along the dynamic dimension: min(%13 - %arg0, 64).
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
// First input row for this tile: each index of output dim 1 covers 16 input rows.
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
// Output tile of shape [%22, 4, 64, 16, 1] and the matching 64x64 input tile.
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// %arg3 walks the slices of output dim 0. The input read below does not depend
// on %arg3, so the same data is written to every slice (the broadcast).
scf.for %arg3 = %c0 to %22 step %c1 {
// %arg4 walks the 64 input columns in groups of 16.
scf.for %arg4 = %c0 to %c64 step %c16 {
// Read 64 rows x 16 columns of the input tile.
%24 = vector.transfer_read %subview_0[%c0, %arg4], %cst {in_bounds = [true, true]} : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<64x16xf16>
// Stage the tile in %alloca through a 64x16 view that drops the leading unit dim.
%subview_1 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.transfer_write %24, %subview_1[%c0, %c0] {in_bounds = [true, true]} : vector<64x16xf16>, memref<64x16xf16>
// View the staged tile as 1x4x16x16x1, then drop the trailing (%subview_2) and
// leading (%subview_3) unit dims to get a 4x16x16 view.
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_2 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_3 = memref.subview %subview_2[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%25 = vector.transfer_read %subview_3[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<4x16x16xf16>, vector<4x16x16xf16>
// Re-add the trailing and leading unit dims so the 5-D transpose still applies.
%26 = vector.shape_cast %25 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%27 = vector.broadcast %26 : vector<4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Swap dims 2 and 3 so the column index comes before the row-within-group index.
%28 = vector.transpose %27, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
// Drop the leading unit dim again.
%29 = vector.extract %28[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
// Rank-4 view of the output tile without its trailing unit dim.
%subview_4 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
// Drop the trailing unit dim of the vector, then store into slice %arg3 at
// column offset %arg4 along dim 2.
%30 = vector.shape_cast %29 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
vector.transfer_write %30, %subview_4[%arg3, %c0, %arg4, %c0] {in_bounds = [true, true, true]} : vector<4x16x16xf16>, memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUVectorTransferLowering (iree-llvmcpu-vector-transfer-lowering) //----- //
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<4x16x16xf16>
%c63 = arith.constant 63 : index
%c62 = arith.constant 62 : index
%c61 = arith.constant 61 : index
%c60 = arith.constant 60 : index
%c59 = arith.constant 59 : index
%c58 = arith.constant 58 : index
%c57 = arith.constant 57 : index
%c56 = arith.constant 56 : index
%c55 = arith.constant 55 : index
%c54 = arith.constant 54 : index
%c53 = arith.constant 53 : index
%c52 = arith.constant 52 : index
%c51 = arith.constant 51 : index
%c50 = arith.constant 50 : index
%c49 = arith.constant 49 : index
%c48 = arith.constant 48 : index
%c47 = arith.constant 47 : index
%c46 = arith.constant 46 : index
%c45 = arith.constant 45 : index
%c44 = arith.constant 44 : index
%c43 = arith.constant 43 : index
%c42 = arith.constant 42 : index
%c41 = arith.constant 41 : index
%c40 = arith.constant 40 : index
%c39 = arith.constant 39 : index
%c38 = arith.constant 38 : index
%c37 = arith.constant 37 : index
%c36 = arith.constant 36 : index
%c35 = arith.constant 35 : index
%c34 = arith.constant 34 : index
%c33 = arith.constant 33 : index
%c32 = arith.constant 32 : index
%c31 = arith.constant 31 : index
%c30 = arith.constant 30 : index
%c29 = arith.constant 29 : index
%c28 = arith.constant 28 : index
%c27 = arith.constant 27 : index
%c26 = arith.constant 26 : index
%c25 = arith.constant 25 : index
%c24 = arith.constant 24 : index
%c23 = arith.constant 23 : index
%c22 = arith.constant 22 : index
%c21 = arith.constant 21 : index
%c20 = arith.constant 20 : index
%c19 = arith.constant 19 : index
%c18 = arith.constant 18 : index
%c17 = arith.constant 17 : index
%c15 = arith.constant 15 : index
%c14 = arith.constant 14 : index
%c13 = arith.constant 13 : index
%c12 = arith.constant 12 : index
%c11 = arith.constant 11 : index
%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index
%c8 = arith.constant 8 : index
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %22 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c16 {
%24 = vector.load %subview_0[%c0, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%25 = vector.load %subview_0[%c1, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%26 = vector.load %subview_0[%c2, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%27 = vector.load %subview_0[%c3, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%28 = vector.load %subview_0[%c4, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%29 = vector.load %subview_0[%c5, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%30 = vector.load %subview_0[%c6, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%31 = vector.load %subview_0[%c7, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%32 = vector.load %subview_0[%c8, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%33 = vector.load %subview_0[%c9, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%34 = vector.load %subview_0[%c10, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%35 = vector.load %subview_0[%c11, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%36 = vector.load %subview_0[%c12, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%37 = vector.load %subview_0[%c13, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%38 = vector.load %subview_0[%c14, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%39 = vector.load %subview_0[%c15, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%40 = vector.load %subview_0[%c16, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%41 = vector.load %subview_0[%c17, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%42 = vector.load %subview_0[%c18, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%43 = vector.load %subview_0[%c19, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%44 = vector.load %subview_0[%c20, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%45 = vector.load %subview_0[%c21, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%46 = vector.load %subview_0[%c22, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%47 = vector.load %subview_0[%c23, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%48 = vector.load %subview_0[%c24, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%49 = vector.load %subview_0[%c25, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%50 = vector.load %subview_0[%c26, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%51 = vector.load %subview_0[%c27, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%52 = vector.load %subview_0[%c28, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%53 = vector.load %subview_0[%c29, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%54 = vector.load %subview_0[%c30, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%55 = vector.load %subview_0[%c31, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%56 = vector.load %subview_0[%c32, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%57 = vector.load %subview_0[%c33, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%58 = vector.load %subview_0[%c34, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%59 = vector.load %subview_0[%c35, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%60 = vector.load %subview_0[%c36, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%61 = vector.load %subview_0[%c37, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%62 = vector.load %subview_0[%c38, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%63 = vector.load %subview_0[%c39, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%64 = vector.load %subview_0[%c40, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%65 = vector.load %subview_0[%c41, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%66 = vector.load %subview_0[%c42, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%67 = vector.load %subview_0[%c43, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%68 = vector.load %subview_0[%c44, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%69 = vector.load %subview_0[%c45, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%70 = vector.load %subview_0[%c46, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%71 = vector.load %subview_0[%c47, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%72 = vector.load %subview_0[%c48, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%73 = vector.load %subview_0[%c49, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%74 = vector.load %subview_0[%c50, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%75 = vector.load %subview_0[%c51, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%76 = vector.load %subview_0[%c52, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%77 = vector.load %subview_0[%c53, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%78 = vector.load %subview_0[%c54, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%79 = vector.load %subview_0[%c55, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%80 = vector.load %subview_0[%c56, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%81 = vector.load %subview_0[%c57, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%82 = vector.load %subview_0[%c58, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%83 = vector.load %subview_0[%c59, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%84 = vector.load %subview_0[%c60, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%85 = vector.load %subview_0[%c61, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%86 = vector.load %subview_0[%c62, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%87 = vector.load %subview_0[%c63, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%subview_1 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.store %24, %subview_1[%c0, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %25, %subview_1[%c1, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %26, %subview_1[%c2, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %27, %subview_1[%c3, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %28, %subview_1[%c4, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %29, %subview_1[%c5, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %30, %subview_1[%c6, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %31, %subview_1[%c7, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %32, %subview_1[%c8, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %33, %subview_1[%c9, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %34, %subview_1[%c10, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %35, %subview_1[%c11, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %36, %subview_1[%c12, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %37, %subview_1[%c13, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %38, %subview_1[%c14, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %39, %subview_1[%c15, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %40, %subview_1[%c16, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %41, %subview_1[%c17, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %42, %subview_1[%c18, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %43, %subview_1[%c19, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %44, %subview_1[%c20, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %45, %subview_1[%c21, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %46, %subview_1[%c22, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %47, %subview_1[%c23, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %48, %subview_1[%c24, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %49, %subview_1[%c25, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %50, %subview_1[%c26, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %51, %subview_1[%c27, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %52, %subview_1[%c28, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %53, %subview_1[%c29, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %54, %subview_1[%c30, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %55, %subview_1[%c31, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %56, %subview_1[%c32, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %57, %subview_1[%c33, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %58, %subview_1[%c34, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %59, %subview_1[%c35, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %60, %subview_1[%c36, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %61, %subview_1[%c37, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %62, %subview_1[%c38, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %63, %subview_1[%c39, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %64, %subview_1[%c40, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %65, %subview_1[%c41, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %66, %subview_1[%c42, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %67, %subview_1[%c43, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %68, %subview_1[%c44, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %69, %subview_1[%c45, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %70, %subview_1[%c46, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %71, %subview_1[%c47, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %72, %subview_1[%c48, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %73, %subview_1[%c49, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %74, %subview_1[%c50, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %75, %subview_1[%c51, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %76, %subview_1[%c52, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %77, %subview_1[%c53, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %78, %subview_1[%c54, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %79, %subview_1[%c55, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %80, %subview_1[%c56, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %81, %subview_1[%c57, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %82, %subview_1[%c58, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %83, %subview_1[%c59, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %84, %subview_1[%c60, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %85, %subview_1[%c61, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %86, %subview_1[%c62, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %87, %subview_1[%c63, %c0] : memref<64x16xf16>, vector<16xf16>
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_2 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_3 = memref.subview %subview_2[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%88 = vector.load %subview_3[%c0, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%89 = vector.insert %88, %cst [0, 0] : vector<16xf16> into vector<4x16x16xf16>
%90 = vector.load %subview_3[%c0, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%91 = vector.insert %90, %89 [0, 1] : vector<16xf16> into vector<4x16x16xf16>
%92 = vector.load %subview_3[%c0, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%93 = vector.insert %92, %91 [0, 2] : vector<16xf16> into vector<4x16x16xf16>
%94 = vector.load %subview_3[%c0, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%95 = vector.insert %94, %93 [0, 3] : vector<16xf16> into vector<4x16x16xf16>
%96 = vector.load %subview_3[%c0, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%97 = vector.insert %96, %95 [0, 4] : vector<16xf16> into vector<4x16x16xf16>
%98 = vector.load %subview_3[%c0, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%99 = vector.insert %98, %97 [0, 5] : vector<16xf16> into vector<4x16x16xf16>
%100 = vector.load %subview_3[%c0, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%101 = vector.insert %100, %99 [0, 6] : vector<16xf16> into vector<4x16x16xf16>
%102 = vector.load %subview_3[%c0, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%103 = vector.insert %102, %101 [0, 7] : vector<16xf16> into vector<4x16x16xf16>
%104 = vector.load %subview_3[%c0, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%105 = vector.insert %104, %103 [0, 8] : vector<16xf16> into vector<4x16x16xf16>
%106 = vector.load %subview_3[%c0, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%107 = vector.insert %106, %105 [0, 9] : vector<16xf16> into vector<4x16x16xf16>
%108 = vector.load %subview_3[%c0, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%109 = vector.insert %108, %107 [0, 10] : vector<16xf16> into vector<4x16x16xf16>
%110 = vector.load %subview_3[%c0, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%111 = vector.insert %110, %109 [0, 11] : vector<16xf16> into vector<4x16x16xf16>
%112 = vector.load %subview_3[%c0, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%113 = vector.insert %112, %111 [0, 12] : vector<16xf16> into vector<4x16x16xf16>
%114 = vector.load %subview_3[%c0, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%115 = vector.insert %114, %113 [0, 13] : vector<16xf16> into vector<4x16x16xf16>
%116 = vector.load %subview_3[%c0, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%117 = vector.insert %116, %115 [0, 14] : vector<16xf16> into vector<4x16x16xf16>
%118 = vector.load %subview_3[%c0, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%119 = vector.insert %118, %117 [0, 15] : vector<16xf16> into vector<4x16x16xf16>
%120 = vector.load %subview_3[%c1, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%121 = vector.insert %120, %119 [1, 0] : vector<16xf16> into vector<4x16x16xf16>
%122 = vector.load %subview_3[%c1, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%123 = vector.insert %122, %121 [1, 1] : vector<16xf16> into vector<4x16x16xf16>
%124 = vector.load %subview_3[%c1, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%125 = vector.insert %124, %123 [1, 2] : vector<16xf16> into vector<4x16x16xf16>
%126 = vector.load %subview_3[%c1, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%127 = vector.insert %126, %125 [1, 3] : vector<16xf16> into vector<4x16x16xf16>
%128 = vector.load %subview_3[%c1, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%129 = vector.insert %128, %127 [1, 4] : vector<16xf16> into vector<4x16x16xf16>
%130 = vector.load %subview_3[%c1, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%131 = vector.insert %130, %129 [1, 5] : vector<16xf16> into vector<4x16x16xf16>
%132 = vector.load %subview_3[%c1, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%133 = vector.insert %132, %131 [1, 6] : vector<16xf16> into vector<4x16x16xf16>
%134 = vector.load %subview_3[%c1, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%135 = vector.insert %134, %133 [1, 7] : vector<16xf16> into vector<4x16x16xf16>
%136 = vector.load %subview_3[%c1, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%137 = vector.insert %136, %135 [1, 8] : vector<16xf16> into vector<4x16x16xf16>
%138 = vector.load %subview_3[%c1, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%139 = vector.insert %138, %137 [1, 9] : vector<16xf16> into vector<4x16x16xf16>
%140 = vector.load %subview_3[%c1, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%141 = vector.insert %140, %139 [1, 10] : vector<16xf16> into vector<4x16x16xf16>
%142 = vector.load %subview_3[%c1, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%143 = vector.insert %142, %141 [1, 11] : vector<16xf16> into vector<4x16x16xf16>
%144 = vector.load %subview_3[%c1, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%145 = vector.insert %144, %143 [1, 12] : vector<16xf16> into vector<4x16x16xf16>
%146 = vector.load %subview_3[%c1, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%147 = vector.insert %146, %145 [1, 13] : vector<16xf16> into vector<4x16x16xf16>
%148 = vector.load %subview_3[%c1, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%149 = vector.insert %148, %147 [1, 14] : vector<16xf16> into vector<4x16x16xf16>
%150 = vector.load %subview_3[%c1, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%151 = vector.insert %150, %149 [1, 15] : vector<16xf16> into vector<4x16x16xf16>
%152 = vector.load %subview_3[%c2, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%153 = vector.insert %152, %151 [2, 0] : vector<16xf16> into vector<4x16x16xf16>
%154 = vector.load %subview_3[%c2, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%155 = vector.insert %154, %153 [2, 1] : vector<16xf16> into vector<4x16x16xf16>
%156 = vector.load %subview_3[%c2, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%157 = vector.insert %156, %155 [2, 2] : vector<16xf16> into vector<4x16x16xf16>
%158 = vector.load %subview_3[%c2, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%159 = vector.insert %158, %157 [2, 3] : vector<16xf16> into vector<4x16x16xf16>
%160 = vector.load %subview_3[%c2, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%161 = vector.insert %160, %159 [2, 4] : vector<16xf16> into vector<4x16x16xf16>
%162 = vector.load %subview_3[%c2, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%163 = vector.insert %162, %161 [2, 5] : vector<16xf16> into vector<4x16x16xf16>
%164 = vector.load %subview_3[%c2, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%165 = vector.insert %164, %163 [2, 6] : vector<16xf16> into vector<4x16x16xf16>
%166 = vector.load %subview_3[%c2, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%167 = vector.insert %166, %165 [2, 7] : vector<16xf16> into vector<4x16x16xf16>
%168 = vector.load %subview_3[%c2, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%169 = vector.insert %168, %167 [2, 8] : vector<16xf16> into vector<4x16x16xf16>
%170 = vector.load %subview_3[%c2, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%171 = vector.insert %170, %169 [2, 9] : vector<16xf16> into vector<4x16x16xf16>
%172 = vector.load %subview_3[%c2, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%173 = vector.insert %172, %171 [2, 10] : vector<16xf16> into vector<4x16x16xf16>
%174 = vector.load %subview_3[%c2, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%175 = vector.insert %174, %173 [2, 11] : vector<16xf16> into vector<4x16x16xf16>
%176 = vector.load %subview_3[%c2, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%177 = vector.insert %176, %175 [2, 12] : vector<16xf16> into vector<4x16x16xf16>
%178 = vector.load %subview_3[%c2, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%179 = vector.insert %178, %177 [2, 13] : vector<16xf16> into vector<4x16x16xf16>
%180 = vector.load %subview_3[%c2, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%181 = vector.insert %180, %179 [2, 14] : vector<16xf16> into vector<4x16x16xf16>
%182 = vector.load %subview_3[%c2, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%183 = vector.insert %182, %181 [2, 15] : vector<16xf16> into vector<4x16x16xf16>
%184 = vector.load %subview_3[%c3, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%185 = vector.insert %184, %183 [3, 0] : vector<16xf16> into vector<4x16x16xf16>
%186 = vector.load %subview_3[%c3, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%187 = vector.insert %186, %185 [3, 1] : vector<16xf16> into vector<4x16x16xf16>
%188 = vector.load %subview_3[%c3, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%189 = vector.insert %188, %187 [3, 2] : vector<16xf16> into vector<4x16x16xf16>
%190 = vector.load %subview_3[%c3, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%191 = vector.insert %190, %189 [3, 3] : vector<16xf16> into vector<4x16x16xf16>
%192 = vector.load %subview_3[%c3, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%193 = vector.insert %192, %191 [3, 4] : vector<16xf16> into vector<4x16x16xf16>
%194 = vector.load %subview_3[%c3, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%195 = vector.insert %194, %193 [3, 5] : vector<16xf16> into vector<4x16x16xf16>
%196 = vector.load %subview_3[%c3, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%197 = vector.insert %196, %195 [3, 6] : vector<16xf16> into vector<4x16x16xf16>
%198 = vector.load %subview_3[%c3, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%199 = vector.insert %198, %197 [3, 7] : vector<16xf16> into vector<4x16x16xf16>
%200 = vector.load %subview_3[%c3, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%201 = vector.insert %200, %199 [3, 8] : vector<16xf16> into vector<4x16x16xf16>
%202 = vector.load %subview_3[%c3, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%203 = vector.insert %202, %201 [3, 9] : vector<16xf16> into vector<4x16x16xf16>
%204 = vector.load %subview_3[%c3, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%205 = vector.insert %204, %203 [3, 10] : vector<16xf16> into vector<4x16x16xf16>
%206 = vector.load %subview_3[%c3, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%207 = vector.insert %206, %205 [3, 11] : vector<16xf16> into vector<4x16x16xf16>
%208 = vector.load %subview_3[%c3, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%209 = vector.insert %208, %207 [3, 12] : vector<16xf16> into vector<4x16x16xf16>
%210 = vector.load %subview_3[%c3, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%211 = vector.insert %210, %209 [3, 13] : vector<16xf16> into vector<4x16x16xf16>
%212 = vector.load %subview_3[%c3, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%213 = vector.insert %212, %211 [3, 14] : vector<16xf16> into vector<4x16x16xf16>
%214 = vector.load %subview_3[%c3, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%215 = vector.insert %214, %213 [3, 15] : vector<16xf16> into vector<4x16x16xf16>
%216 = vector.shape_cast %215 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%217 = vector.broadcast %216 : vector<4x16x16x1xf16> to vector<1x4x16x16x1xf16>
%218 = vector.transpose %217, [0, 1, 3, 2, 4] : vector<1x4x16x16x1xf16> to vector<1x4x16x16x1xf16>
%219 = vector.extract %218[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
%subview_4 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%220 = vector.shape_cast %219 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
%221 = vector.extract %220[0, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %221, %subview_4[%arg3, %c0, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%222 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%223 = vector.extract %220[0, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %223, %subview_4[%arg3, %c0, %222, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%224 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%225 = vector.extract %220[0, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %225, %subview_4[%arg3, %c0, %224, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%226 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%227 = vector.extract %220[0, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %227, %subview_4[%arg3, %c0, %226, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%228 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%229 = vector.extract %220[0, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %229, %subview_4[%arg3, %c0, %228, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%230 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%231 = vector.extract %220[0, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %231, %subview_4[%arg3, %c0, %230, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%232 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%233 = vector.extract %220[0, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %233, %subview_4[%arg3, %c0, %232, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%234 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%235 = vector.extract %220[0, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %235, %subview_4[%arg3, %c0, %234, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%236 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%237 = vector.extract %220[0, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %237, %subview_4[%arg3, %c0, %236, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%238 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%239 = vector.extract %220[0, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %239, %subview_4[%arg3, %c0, %238, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%240 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%241 = vector.extract %220[0, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %241, %subview_4[%arg3, %c0, %240, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%242 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%243 = vector.extract %220[0, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %243, %subview_4[%arg3, %c0, %242, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%244 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%245 = vector.extract %220[0, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %245, %subview_4[%arg3, %c0, %244, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%246 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%247 = vector.extract %220[0, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %247, %subview_4[%arg3, %c0, %246, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%248 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%249 = vector.extract %220[0, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %249, %subview_4[%arg3, %c0, %248, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%250 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%251 = vector.extract %220[0, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %251, %subview_4[%arg3, %c0, %250, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%252 = vector.extract %220[1, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %252, %subview_4[%arg3, %c1, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%253 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%254 = vector.extract %220[1, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %254, %subview_4[%arg3, %c1, %253, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%255 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%256 = vector.extract %220[1, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %256, %subview_4[%arg3, %c1, %255, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%257 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%258 = vector.extract %220[1, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %258, %subview_4[%arg3, %c1, %257, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%259 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%260 = vector.extract %220[1, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %260, %subview_4[%arg3, %c1, %259, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%261 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%262 = vector.extract %220[1, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %262, %subview_4[%arg3, %c1, %261, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%263 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%264 = vector.extract %220[1, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %264, %subview_4[%arg3, %c1, %263, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%265 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%266 = vector.extract %220[1, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %266, %subview_4[%arg3, %c1, %265, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%267 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%268 = vector.extract %220[1, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %268, %subview_4[%arg3, %c1, %267, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%269 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%270 = vector.extract %220[1, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %270, %subview_4[%arg3, %c1, %269, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%271 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%272 = vector.extract %220[1, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %272, %subview_4[%arg3, %c1, %271, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%273 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%274 = vector.extract %220[1, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %274, %subview_4[%arg3, %c1, %273, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%275 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%276 = vector.extract %220[1, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %276, %subview_4[%arg3, %c1, %275, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%277 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%278 = vector.extract %220[1, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %278, %subview_4[%arg3, %c1, %277, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%279 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%280 = vector.extract %220[1, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %280, %subview_4[%arg3, %c1, %279, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%281 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%282 = vector.extract %220[1, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %282, %subview_4[%arg3, %c1, %281, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%283 = vector.extract %220[2, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %283, %subview_4[%arg3, %c2, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%284 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%285 = vector.extract %220[2, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %285, %subview_4[%arg3, %c2, %284, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%286 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%287 = vector.extract %220[2, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %287, %subview_4[%arg3, %c2, %286, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%288 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%289 = vector.extract %220[2, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %289, %subview_4[%arg3, %c2, %288, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%290 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%291 = vector.extract %220[2, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %291, %subview_4[%arg3, %c2, %290, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%292 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%293 = vector.extract %220[2, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %293, %subview_4[%arg3, %c2, %292, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%294 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%295 = vector.extract %220[2, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %295, %subview_4[%arg3, %c2, %294, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%296 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%297 = vector.extract %220[2, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %297, %subview_4[%arg3, %c2, %296, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%298 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%299 = vector.extract %220[2, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %299, %subview_4[%arg3, %c2, %298, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%300 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%301 = vector.extract %220[2, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %301, %subview_4[%arg3, %c2, %300, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%302 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%303 = vector.extract %220[2, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %303, %subview_4[%arg3, %c2, %302, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%304 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%305 = vector.extract %220[2, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %305, %subview_4[%arg3, %c2, %304, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%306 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%307 = vector.extract %220[2, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %307, %subview_4[%arg3, %c2, %306, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%308 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%309 = vector.extract %220[2, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %309, %subview_4[%arg3, %c2, %308, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%310 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%311 = vector.extract %220[2, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %311, %subview_4[%arg3, %c2, %310, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%312 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%313 = vector.extract %220[2, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %313, %subview_4[%arg3, %c2, %312, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%314 = vector.extract %220[3, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %314, %subview_4[%arg3, %c3, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%315 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%316 = vector.extract %220[3, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %316, %subview_4[%arg3, %c3, %315, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%317 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%318 = vector.extract %220[3, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %318, %subview_4[%arg3, %c3, %317, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%319 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%320 = vector.extract %220[3, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %320, %subview_4[%arg3, %c3, %319, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%321 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%322 = vector.extract %220[3, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %322, %subview_4[%arg3, %c3, %321, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%323 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%324 = vector.extract %220[3, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %324, %subview_4[%arg3, %c3, %323, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%325 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%326 = vector.extract %220[3, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %326, %subview_4[%arg3, %c3, %325, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%327 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%328 = vector.extract %220[3, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %328, %subview_4[%arg3, %c3, %327, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%329 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%330 = vector.extract %220[3, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %330, %subview_4[%arg3, %c3, %329, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%331 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%332 = vector.extract %220[3, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %332, %subview_4[%arg3, %c3, %331, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%333 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%334 = vector.extract %220[3, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %334, %subview_4[%arg3, %c3, %333, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%335 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%336 = vector.extract %220[3, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %336, %subview_4[%arg3, %c3, %335, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%337 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%338 = vector.extract %220[3, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %338, %subview_4[%arg3, %c3, %337, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%339 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%340 = vector.extract %220[3, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %340, %subview_4[%arg3, %c3, %339, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%341 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%342 = vector.extract %220[3, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %342, %subview_4[%arg3, %c3, %341, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%343 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%344 = vector.extract %220[3, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %344, %subview_4[%arg3, %c3, %343, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUVectorTransposeLowering (iree-llvmcpu-vector-transpose-lowering) //----- //
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x4x16x16x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<4x16x16xf16>
%c63 = arith.constant 63 : index
%c62 = arith.constant 62 : index
%c61 = arith.constant 61 : index
%c60 = arith.constant 60 : index
%c59 = arith.constant 59 : index
%c58 = arith.constant 58 : index
%c57 = arith.constant 57 : index
%c56 = arith.constant 56 : index
%c55 = arith.constant 55 : index
%c54 = arith.constant 54 : index
%c53 = arith.constant 53 : index
%c52 = arith.constant 52 : index
%c51 = arith.constant 51 : index
%c50 = arith.constant 50 : index
%c49 = arith.constant 49 : index
%c48 = arith.constant 48 : index
%c47 = arith.constant 47 : index
%c46 = arith.constant 46 : index
%c45 = arith.constant 45 : index
%c44 = arith.constant 44 : index
%c43 = arith.constant 43 : index
%c42 = arith.constant 42 : index
%c41 = arith.constant 41 : index
%c40 = arith.constant 40 : index
%c39 = arith.constant 39 : index
%c38 = arith.constant 38 : index
%c37 = arith.constant 37 : index
%c36 = arith.constant 36 : index
%c35 = arith.constant 35 : index
%c34 = arith.constant 34 : index
%c33 = arith.constant 33 : index
%c32 = arith.constant 32 : index
%c31 = arith.constant 31 : index
%c30 = arith.constant 30 : index
%c29 = arith.constant 29 : index
%c28 = arith.constant 28 : index
%c27 = arith.constant 27 : index
%c26 = arith.constant 26 : index
%c25 = arith.constant 25 : index
%c24 = arith.constant 24 : index
%c23 = arith.constant 23 : index
%c22 = arith.constant 22 : index
%c21 = arith.constant 21 : index
%c20 = arith.constant 20 : index
%c19 = arith.constant 19 : index
%c18 = arith.constant 18 : index
%c17 = arith.constant 17 : index
%c15 = arith.constant 15 : index
%c14 = arith.constant 14 : index
%c13 = arith.constant 13 : index
%c12 = arith.constant 12 : index
%c11 = arith.constant 11 : index
%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index
%c8 = arith.constant 8 : index
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %22 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c16 {
%24 = vector.load %subview_1[%c0, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%25 = vector.load %subview_1[%c1, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%26 = vector.load %subview_1[%c2, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%27 = vector.load %subview_1[%c3, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%28 = vector.load %subview_1[%c4, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%29 = vector.load %subview_1[%c5, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%30 = vector.load %subview_1[%c6, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%31 = vector.load %subview_1[%c7, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%32 = vector.load %subview_1[%c8, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%33 = vector.load %subview_1[%c9, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%34 = vector.load %subview_1[%c10, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%35 = vector.load %subview_1[%c11, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%36 = vector.load %subview_1[%c12, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%37 = vector.load %subview_1[%c13, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%38 = vector.load %subview_1[%c14, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%39 = vector.load %subview_1[%c15, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%40 = vector.load %subview_1[%c16, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%41 = vector.load %subview_1[%c17, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%42 = vector.load %subview_1[%c18, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%43 = vector.load %subview_1[%c19, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%44 = vector.load %subview_1[%c20, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%45 = vector.load %subview_1[%c21, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%46 = vector.load %subview_1[%c22, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%47 = vector.load %subview_1[%c23, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%48 = vector.load %subview_1[%c24, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%49 = vector.load %subview_1[%c25, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%50 = vector.load %subview_1[%c26, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%51 = vector.load %subview_1[%c27, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%52 = vector.load %subview_1[%c28, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%53 = vector.load %subview_1[%c29, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%54 = vector.load %subview_1[%c30, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%55 = vector.load %subview_1[%c31, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%56 = vector.load %subview_1[%c32, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%57 = vector.load %subview_1[%c33, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%58 = vector.load %subview_1[%c34, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%59 = vector.load %subview_1[%c35, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%60 = vector.load %subview_1[%c36, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%61 = vector.load %subview_1[%c37, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%62 = vector.load %subview_1[%c38, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%63 = vector.load %subview_1[%c39, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%64 = vector.load %subview_1[%c40, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%65 = vector.load %subview_1[%c41, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%66 = vector.load %subview_1[%c42, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%67 = vector.load %subview_1[%c43, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%68 = vector.load %subview_1[%c44, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%69 = vector.load %subview_1[%c45, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%70 = vector.load %subview_1[%c46, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%71 = vector.load %subview_1[%c47, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%72 = vector.load %subview_1[%c48, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%73 = vector.load %subview_1[%c49, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%74 = vector.load %subview_1[%c50, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%75 = vector.load %subview_1[%c51, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%76 = vector.load %subview_1[%c52, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%77 = vector.load %subview_1[%c53, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%78 = vector.load %subview_1[%c54, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%79 = vector.load %subview_1[%c55, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%80 = vector.load %subview_1[%c56, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%81 = vector.load %subview_1[%c57, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%82 = vector.load %subview_1[%c58, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%83 = vector.load %subview_1[%c59, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%84 = vector.load %subview_1[%c60, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%85 = vector.load %subview_1[%c61, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%86 = vector.load %subview_1[%c62, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%87 = vector.load %subview_1[%c63, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%subview_2 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.store %24, %subview_2[%c0, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %25, %subview_2[%c1, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %26, %subview_2[%c2, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %27, %subview_2[%c3, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %28, %subview_2[%c4, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %29, %subview_2[%c5, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %30, %subview_2[%c6, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %31, %subview_2[%c7, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %32, %subview_2[%c8, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %33, %subview_2[%c9, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %34, %subview_2[%c10, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %35, %subview_2[%c11, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %36, %subview_2[%c12, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %37, %subview_2[%c13, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %38, %subview_2[%c14, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %39, %subview_2[%c15, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %40, %subview_2[%c16, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %41, %subview_2[%c17, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %42, %subview_2[%c18, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %43, %subview_2[%c19, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %44, %subview_2[%c20, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %45, %subview_2[%c21, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %46, %subview_2[%c22, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %47, %subview_2[%c23, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %48, %subview_2[%c24, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %49, %subview_2[%c25, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %50, %subview_2[%c26, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %51, %subview_2[%c27, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %52, %subview_2[%c28, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %53, %subview_2[%c29, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %54, %subview_2[%c30, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %55, %subview_2[%c31, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %56, %subview_2[%c32, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %57, %subview_2[%c33, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %58, %subview_2[%c34, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %59, %subview_2[%c35, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %60, %subview_2[%c36, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %61, %subview_2[%c37, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %62, %subview_2[%c38, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %63, %subview_2[%c39, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %64, %subview_2[%c40, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %65, %subview_2[%c41, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %66, %subview_2[%c42, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %67, %subview_2[%c43, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %68, %subview_2[%c44, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %69, %subview_2[%c45, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %70, %subview_2[%c46, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %71, %subview_2[%c47, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %72, %subview_2[%c48, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %73, %subview_2[%c49, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %74, %subview_2[%c50, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %75, %subview_2[%c51, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %76, %subview_2[%c52, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %77, %subview_2[%c53, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %78, %subview_2[%c54, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %79, %subview_2[%c55, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %80, %subview_2[%c56, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %81, %subview_2[%c57, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %82, %subview_2[%c58, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %83, %subview_2[%c59, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %84, %subview_2[%c60, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %85, %subview_2[%c61, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %86, %subview_2[%c62, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %87, %subview_2[%c63, %c0] : memref<64x16xf16>, vector<16xf16>
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_3 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%88 = vector.load %subview_4[%c0, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%89 = vector.insert %88, %cst_0 [0, 0] : vector<16xf16> into vector<4x16x16xf16>
%90 = vector.load %subview_4[%c0, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%91 = vector.insert %90, %89 [0, 1] : vector<16xf16> into vector<4x16x16xf16>
%92 = vector.load %subview_4[%c0, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%93 = vector.insert %92, %91 [0, 2] : vector<16xf16> into vector<4x16x16xf16>
%94 = vector.load %subview_4[%c0, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%95 = vector.insert %94, %93 [0, 3] : vector<16xf16> into vector<4x16x16xf16>
%96 = vector.load %subview_4[%c0, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%97 = vector.insert %96, %95 [0, 4] : vector<16xf16> into vector<4x16x16xf16>
%98 = vector.load %subview_4[%c0, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%99 = vector.insert %98, %97 [0, 5] : vector<16xf16> into vector<4x16x16xf16>
%100 = vector.load %subview_4[%c0, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%101 = vector.insert %100, %99 [0, 6] : vector<16xf16> into vector<4x16x16xf16>
%102 = vector.load %subview_4[%c0, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%103 = vector.insert %102, %101 [0, 7] : vector<16xf16> into vector<4x16x16xf16>
%104 = vector.load %subview_4[%c0, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%105 = vector.insert %104, %103 [0, 8] : vector<16xf16> into vector<4x16x16xf16>
%106 = vector.load %subview_4[%c0, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%107 = vector.insert %106, %105 [0, 9] : vector<16xf16> into vector<4x16x16xf16>
%108 = vector.load %subview_4[%c0, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%109 = vector.insert %108, %107 [0, 10] : vector<16xf16> into vector<4x16x16xf16>
%110 = vector.load %subview_4[%c0, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%111 = vector.insert %110, %109 [0, 11] : vector<16xf16> into vector<4x16x16xf16>
%112 = vector.load %subview_4[%c0, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%113 = vector.insert %112, %111 [0, 12] : vector<16xf16> into vector<4x16x16xf16>
%114 = vector.load %subview_4[%c0, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%115 = vector.insert %114, %113 [0, 13] : vector<16xf16> into vector<4x16x16xf16>
%116 = vector.load %subview_4[%c0, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%117 = vector.insert %116, %115 [0, 14] : vector<16xf16> into vector<4x16x16xf16>
%118 = vector.load %subview_4[%c0, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%119 = vector.insert %118, %117 [0, 15] : vector<16xf16> into vector<4x16x16xf16>
%120 = vector.load %subview_4[%c1, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%121 = vector.insert %120, %119 [1, 0] : vector<16xf16> into vector<4x16x16xf16>
%122 = vector.load %subview_4[%c1, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%123 = vector.insert %122, %121 [1, 1] : vector<16xf16> into vector<4x16x16xf16>
%124 = vector.load %subview_4[%c1, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%125 = vector.insert %124, %123 [1, 2] : vector<16xf16> into vector<4x16x16xf16>
%126 = vector.load %subview_4[%c1, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%127 = vector.insert %126, %125 [1, 3] : vector<16xf16> into vector<4x16x16xf16>
%128 = vector.load %subview_4[%c1, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%129 = vector.insert %128, %127 [1, 4] : vector<16xf16> into vector<4x16x16xf16>
%130 = vector.load %subview_4[%c1, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%131 = vector.insert %130, %129 [1, 5] : vector<16xf16> into vector<4x16x16xf16>
%132 = vector.load %subview_4[%c1, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%133 = vector.insert %132, %131 [1, 6] : vector<16xf16> into vector<4x16x16xf16>
%134 = vector.load %subview_4[%c1, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%135 = vector.insert %134, %133 [1, 7] : vector<16xf16> into vector<4x16x16xf16>
%136 = vector.load %subview_4[%c1, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%137 = vector.insert %136, %135 [1, 8] : vector<16xf16> into vector<4x16x16xf16>
%138 = vector.load %subview_4[%c1, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%139 = vector.insert %138, %137 [1, 9] : vector<16xf16> into vector<4x16x16xf16>
%140 = vector.load %subview_4[%c1, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%141 = vector.insert %140, %139 [1, 10] : vector<16xf16> into vector<4x16x16xf16>
%142 = vector.load %subview_4[%c1, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%143 = vector.insert %142, %141 [1, 11] : vector<16xf16> into vector<4x16x16xf16>
%144 = vector.load %subview_4[%c1, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%145 = vector.insert %144, %143 [1, 12] : vector<16xf16> into vector<4x16x16xf16>
%146 = vector.load %subview_4[%c1, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%147 = vector.insert %146, %145 [1, 13] : vector<16xf16> into vector<4x16x16xf16>
%148 = vector.load %subview_4[%c1, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%149 = vector.insert %148, %147 [1, 14] : vector<16xf16> into vector<4x16x16xf16>
%150 = vector.load %subview_4[%c1, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%151 = vector.insert %150, %149 [1, 15] : vector<16xf16> into vector<4x16x16xf16>
%152 = vector.load %subview_4[%c2, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%153 = vector.insert %152, %151 [2, 0] : vector<16xf16> into vector<4x16x16xf16>
%154 = vector.load %subview_4[%c2, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%155 = vector.insert %154, %153 [2, 1] : vector<16xf16> into vector<4x16x16xf16>
%156 = vector.load %subview_4[%c2, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%157 = vector.insert %156, %155 [2, 2] : vector<16xf16> into vector<4x16x16xf16>
%158 = vector.load %subview_4[%c2, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%159 = vector.insert %158, %157 [2, 3] : vector<16xf16> into vector<4x16x16xf16>
%160 = vector.load %subview_4[%c2, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%161 = vector.insert %160, %159 [2, 4] : vector<16xf16> into vector<4x16x16xf16>
%162 = vector.load %subview_4[%c2, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%163 = vector.insert %162, %161 [2, 5] : vector<16xf16> into vector<4x16x16xf16>
%164 = vector.load %subview_4[%c2, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%165 = vector.insert %164, %163 [2, 6] : vector<16xf16> into vector<4x16x16xf16>
%166 = vector.load %subview_4[%c2, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%167 = vector.insert %166, %165 [2, 7] : vector<16xf16> into vector<4x16x16xf16>
%168 = vector.load %subview_4[%c2, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%169 = vector.insert %168, %167 [2, 8] : vector<16xf16> into vector<4x16x16xf16>
%170 = vector.load %subview_4[%c2, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%171 = vector.insert %170, %169 [2, 9] : vector<16xf16> into vector<4x16x16xf16>
%172 = vector.load %subview_4[%c2, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%173 = vector.insert %172, %171 [2, 10] : vector<16xf16> into vector<4x16x16xf16>
%174 = vector.load %subview_4[%c2, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%175 = vector.insert %174, %173 [2, 11] : vector<16xf16> into vector<4x16x16xf16>
%176 = vector.load %subview_4[%c2, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%177 = vector.insert %176, %175 [2, 12] : vector<16xf16> into vector<4x16x16xf16>
%178 = vector.load %subview_4[%c2, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%179 = vector.insert %178, %177 [2, 13] : vector<16xf16> into vector<4x16x16xf16>
%180 = vector.load %subview_4[%c2, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%181 = vector.insert %180, %179 [2, 14] : vector<16xf16> into vector<4x16x16xf16>
%182 = vector.load %subview_4[%c2, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%183 = vector.insert %182, %181 [2, 15] : vector<16xf16> into vector<4x16x16xf16>
%184 = vector.load %subview_4[%c3, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%185 = vector.insert %184, %183 [3, 0] : vector<16xf16> into vector<4x16x16xf16>
%186 = vector.load %subview_4[%c3, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%187 = vector.insert %186, %185 [3, 1] : vector<16xf16> into vector<4x16x16xf16>
%188 = vector.load %subview_4[%c3, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%189 = vector.insert %188, %187 [3, 2] : vector<16xf16> into vector<4x16x16xf16>
%190 = vector.load %subview_4[%c3, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%191 = vector.insert %190, %189 [3, 3] : vector<16xf16> into vector<4x16x16xf16>
%192 = vector.load %subview_4[%c3, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%193 = vector.insert %192, %191 [3, 4] : vector<16xf16> into vector<4x16x16xf16>
%194 = vector.load %subview_4[%c3, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%195 = vector.insert %194, %193 [3, 5] : vector<16xf16> into vector<4x16x16xf16>
%196 = vector.load %subview_4[%c3, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%197 = vector.insert %196, %195 [3, 6] : vector<16xf16> into vector<4x16x16xf16>
%198 = vector.load %subview_4[%c3, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%199 = vector.insert %198, %197 [3, 7] : vector<16xf16> into vector<4x16x16xf16>
%200 = vector.load %subview_4[%c3, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%201 = vector.insert %200, %199 [3, 8] : vector<16xf16> into vector<4x16x16xf16>
%202 = vector.load %subview_4[%c3, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%203 = vector.insert %202, %201 [3, 9] : vector<16xf16> into vector<4x16x16xf16>
%204 = vector.load %subview_4[%c3, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%205 = vector.insert %204, %203 [3, 10] : vector<16xf16> into vector<4x16x16xf16>
%206 = vector.load %subview_4[%c3, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%207 = vector.insert %206, %205 [3, 11] : vector<16xf16> into vector<4x16x16xf16>
%208 = vector.load %subview_4[%c3, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%209 = vector.insert %208, %207 [3, 12] : vector<16xf16> into vector<4x16x16xf16>
%210 = vector.load %subview_4[%c3, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%211 = vector.insert %210, %209 [3, 13] : vector<16xf16> into vector<4x16x16xf16>
%212 = vector.load %subview_4[%c3, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%213 = vector.insert %212, %211 [3, 14] : vector<16xf16> into vector<4x16x16xf16>
%214 = vector.load %subview_4[%c3, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%215 = vector.insert %214, %213 [3, 15] : vector<16xf16> into vector<4x16x16xf16>
%216 = vector.shape_cast %215 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%217 = vector.extract %216[0, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%218 = vector.insert %217, %cst [0, 0, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%219 = vector.extract %216[0, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%220 = vector.insert %219, %218 [0, 0, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%221 = vector.extract %216[0, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%222 = vector.insert %221, %220 [0, 0, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%223 = vector.extract %216[0, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%224 = vector.insert %223, %222 [0, 0, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%225 = vector.extract %216[0, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%226 = vector.insert %225, %224 [0, 0, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%227 = vector.extract %216[0, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%228 = vector.insert %227, %226 [0, 0, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%229 = vector.extract %216[0, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%230 = vector.insert %229, %228 [0, 0, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%231 = vector.extract %216[0, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%232 = vector.insert %231, %230 [0, 0, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%233 = vector.extract %216[0, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%234 = vector.insert %233, %232 [0, 0, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%235 = vector.extract %216[0, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%236 = vector.insert %235, %234 [0, 0, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%237 = vector.extract %216[0, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%238 = vector.insert %237, %236 [0, 0, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%239 = vector.extract %216[0, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%240 = vector.insert %239, %238 [0, 0, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%241 = vector.extract %216[0, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%242 = vector.insert %241, %240 [0, 0, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%243 = vector.extract %216[0, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%244 = vector.insert %243, %242 [0, 0, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%245 = vector.extract %216[0, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%246 = vector.insert %245, %244 [0, 0, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%247 = vector.extract %216[0, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%248 = vector.insert %247, %246 [0, 0, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%249 = vector.extract %216[0, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%250 = vector.insert %249, %248 [0, 0, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%251 = vector.extract %216[0, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%252 = vector.insert %251, %250 [0, 0, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%253 = vector.extract %216[0, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%254 = vector.insert %253, %252 [0, 0, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%255 = vector.extract %216[0, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%256 = vector.insert %255, %254 [0, 0, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%257 = vector.extract %216[0, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%258 = vector.insert %257, %256 [0, 0, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%259 = vector.extract %216[0, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%260 = vector.insert %259, %258 [0, 0, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%261 = vector.extract %216[0, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%262 = vector.insert %261, %260 [0, 0, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%263 = vector.extract %216[0, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%264 = vector.insert %263, %262 [0, 0, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%265 = vector.extract %216[0, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%266 = vector.insert %265, %264 [0, 0, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%267 = vector.extract %216[0, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%268 = vector.insert %267, %266 [0, 0, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%269 = vector.extract %216[0, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%270 = vector.insert %269, %268 [0, 0, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%271 = vector.extract %216[0, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%272 = vector.insert %271, %270 [0, 0, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%273 = vector.extract %216[0, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%274 = vector.insert %273, %272 [0, 0, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%275 = vector.extract %216[0, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%276 = vector.insert %275, %274 [0, 0, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%277 = vector.extract %216[0, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%278 = vector.insert %277, %276 [0, 0, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%279 = vector.extract %216[0, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%280 = vector.insert %279, %278 [0, 0, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%281 = vector.extract %216[0, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%282 = vector.insert %281, %280 [0, 0, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%283 = vector.extract %216[0, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%284 = vector.insert %283, %282 [0, 0, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%285 = vector.extract %216[0, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%286 = vector.insert %285, %284 [0, 0, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%287 = vector.extract %216[0, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%288 = vector.insert %287, %286 [0, 0, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%289 = vector.extract %216[0, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%290 = vector.insert %289, %288 [0, 0, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%291 = vector.extract %216[0, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%292 = vector.insert %291, %290 [0, 0, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%293 = vector.extract %216[0, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%294 = vector.insert %293, %292 [0, 0, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%295 = vector.extract %216[0, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%296 = vector.insert %295, %294 [0, 0, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%297 = vector.extract %216[0, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%298 = vector.insert %297, %296 [0, 0, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%299 = vector.extract %216[0, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%300 = vector.insert %299, %298 [0, 0, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%301 = vector.extract %216[0, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%302 = vector.insert %301, %300 [0, 0, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%303 = vector.extract %216[0, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%304 = vector.insert %303, %302 [0, 0, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%305 = vector.extract %216[0, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%306 = vector.insert %305, %304 [0, 0, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%307 = vector.extract %216[0, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%308 = vector.insert %307, %306 [0, 0, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%309 = vector.extract %216[0, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%310 = vector.insert %309, %308 [0, 0, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%311 = vector.extract %216[0, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%312 = vector.insert %311, %310 [0, 0, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%313 = vector.extract %216[0, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%314 = vector.insert %313, %312 [0, 0, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%315 = vector.extract %216[0, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%316 = vector.insert %315, %314 [0, 0, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%317 = vector.extract %216[0, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%318 = vector.insert %317, %316 [0, 0, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%319 = vector.extract %216[0, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%320 = vector.insert %319, %318 [0, 0, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%321 = vector.extract %216[0, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%322 = vector.insert %321, %320 [0, 0, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%323 = vector.extract %216[0, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%324 = vector.insert %323, %322 [0, 0, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%325 = vector.extract %216[0, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%326 = vector.insert %325, %324 [0, 0, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%327 = vector.extract %216[0, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%328 = vector.insert %327, %326 [0, 0, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%329 = vector.extract %216[0, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%330 = vector.insert %329, %328 [0, 0, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%331 = vector.extract %216[0, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%332 = vector.insert %331, %330 [0, 0, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%333 = vector.extract %216[0, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%334 = vector.insert %333, %332 [0, 0, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%335 = vector.extract %216[0, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%336 = vector.insert %335, %334 [0, 0, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%337 = vector.extract %216[0, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%338 = vector.insert %337, %336 [0, 0, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%339 = vector.extract %216[0, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%340 = vector.insert %339, %338 [0, 0, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%341 = vector.extract %216[0, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%342 = vector.insert %341, %340 [0, 0, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%343 = vector.extract %216[0, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%344 = vector.insert %343, %342 [0, 0, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%345 = vector.extract %216[0, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%346 = vector.insert %345, %344 [0, 0, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%347 = vector.extract %216[0, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%348 = vector.insert %347, %346 [0, 0, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%349 = vector.extract %216[0, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%350 = vector.insert %349, %348 [0, 0, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%351 = vector.extract %216[0, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%352 = vector.insert %351, %350 [0, 0, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%353 = vector.extract %216[0, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%354 = vector.insert %353, %352 [0, 0, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%355 = vector.extract %216[0, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%356 = vector.insert %355, %354 [0, 0, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%357 = vector.extract %216[0, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%358 = vector.insert %357, %356 [0, 0, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%359 = vector.extract %216[0, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%360 = vector.insert %359, %358 [0, 0, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%361 = vector.extract %216[0, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%362 = vector.insert %361, %360 [0, 0, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%363 = vector.extract %216[0, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%364 = vector.insert %363, %362 [0, 0, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%365 = vector.extract %216[0, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%366 = vector.insert %365, %364 [0, 0, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%367 = vector.extract %216[0, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%368 = vector.insert %367, %366 [0, 0, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%369 = vector.extract %216[0, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%370 = vector.insert %369, %368 [0, 0, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%371 = vector.extract %216[0, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%372 = vector.insert %371, %370 [0, 0, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%373 = vector.extract %216[0, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%374 = vector.insert %373, %372 [0, 0, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%375 = vector.extract %216[0, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%376 = vector.insert %375, %374 [0, 0, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%377 = vector.extract %216[0, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%378 = vector.insert %377, %376 [0, 0, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%379 = vector.extract %216[0, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%380 = vector.insert %379, %378 [0, 0, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%381 = vector.extract %216[0, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%382 = vector.insert %381, %380 [0, 0, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%383 = vector.extract %216[0, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%384 = vector.insert %383, %382 [0, 0, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%385 = vector.extract %216[0, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%386 = vector.insert %385, %384 [0, 0, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%387 = vector.extract %216[0, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%388 = vector.insert %387, %386 [0, 0, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%389 = vector.extract %216[0, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%390 = vector.insert %389, %388 [0, 0, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%391 = vector.extract %216[0, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%392 = vector.insert %391, %390 [0, 0, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%393 = vector.extract %216[0, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%394 = vector.insert %393, %392 [0, 0, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%395 = vector.extract %216[0, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%396 = vector.insert %395, %394 [0, 0, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%397 = vector.extract %216[0, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%398 = vector.insert %397, %396 [0, 0, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%399 = vector.extract %216[0, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%400 = vector.insert %399, %398 [0, 0, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%401 = vector.extract %216[0, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%402 = vector.insert %401, %400 [0, 0, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%403 = vector.extract %216[0, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%404 = vector.insert %403, %402 [0, 0, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%405 = vector.extract %216[0, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%406 = vector.insert %405, %404 [0, 0, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%407 = vector.extract %216[0, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%408 = vector.insert %407, %406 [0, 0, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%409 = vector.extract %216[0, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%410 = vector.insert %409, %408 [0, 0, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%411 = vector.extract %216[0, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%412 = vector.insert %411, %410 [0, 0, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%413 = vector.extract %216[0, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%414 = vector.insert %413, %412 [0, 0, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%415 = vector.extract %216[0, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%416 = vector.insert %415, %414 [0, 0, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%417 = vector.extract %216[0, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%418 = vector.insert %417, %416 [0, 0, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%419 = vector.extract %216[0, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%420 = vector.insert %419, %418 [0, 0, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%421 = vector.extract %216[0, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%422 = vector.insert %421, %420 [0, 0, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%423 = vector.extract %216[0, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%424 = vector.insert %423, %422 [0, 0, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%425 = vector.extract %216[0, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%426 = vector.insert %425, %424 [0, 0, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%427 = vector.extract %216[0, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%428 = vector.insert %427, %426 [0, 0, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%429 = vector.extract %216[0, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%430 = vector.insert %429, %428 [0, 0, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%431 = vector.extract %216[0, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%432 = vector.insert %431, %430 [0, 0, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%433 = vector.extract %216[0, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%434 = vector.insert %433, %432 [0, 0, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%435 = vector.extract %216[0, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%436 = vector.insert %435, %434 [0, 0, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%437 = vector.extract %216[0, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%438 = vector.insert %437, %436 [0, 0, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%439 = vector.extract %216[0, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%440 = vector.insert %439, %438 [0, 0, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%441 = vector.extract %216[0, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%442 = vector.insert %441, %440 [0, 0, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%443 = vector.extract %216[0, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%444 = vector.insert %443, %442 [0, 0, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%445 = vector.extract %216[0, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%446 = vector.insert %445, %444 [0, 0, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%447 = vector.extract %216[0, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%448 = vector.insert %447, %446 [0, 0, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%449 = vector.extract %216[0, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%450 = vector.insert %449, %448 [0, 0, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%451 = vector.extract %216[0, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%452 = vector.insert %451, %450 [0, 0, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%453 = vector.extract %216[0, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%454 = vector.insert %453, %452 [0, 0, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%455 = vector.extract %216[0, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%456 = vector.insert %455, %454 [0, 0, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%457 = vector.extract %216[0, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%458 = vector.insert %457, %456 [0, 0, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%459 = vector.extract %216[0, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%460 = vector.insert %459, %458 [0, 0, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%461 = vector.extract %216[0, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%462 = vector.insert %461, %460 [0, 0, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%463 = vector.extract %216[0, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%464 = vector.insert %463, %462 [0, 0, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%465 = vector.extract %216[0, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%466 = vector.insert %465, %464 [0, 0, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%467 = vector.extract %216[0, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%468 = vector.insert %467, %466 [0, 0, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%469 = vector.extract %216[0, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%470 = vector.insert %469, %468 [0, 0, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%471 = vector.extract %216[0, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%472 = vector.insert %471, %470 [0, 0, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%473 = vector.extract %216[0, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%474 = vector.insert %473, %472 [0, 0, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%475 = vector.extract %216[0, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%476 = vector.insert %475, %474 [0, 0, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%477 = vector.extract %216[0, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%478 = vector.insert %477, %476 [0, 0, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%479 = vector.extract %216[0, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%480 = vector.insert %479, %478 [0, 0, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%481 = vector.extract %216[0, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%482 = vector.insert %481, %480 [0, 0, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%483 = vector.extract %216[0, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%484 = vector.insert %483, %482 [0, 0, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%485 = vector.extract %216[0, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%486 = vector.insert %485, %484 [0, 0, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%487 = vector.extract %216[0, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%488 = vector.insert %487, %486 [0, 0, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%489 = vector.extract %216[0, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%490 = vector.insert %489, %488 [0, 0, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%491 = vector.extract %216[0, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%492 = vector.insert %491, %490 [0, 0, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%493 = vector.extract %216[0, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%494 = vector.insert %493, %492 [0, 0, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%495 = vector.extract %216[0, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%496 = vector.insert %495, %494 [0, 0, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%497 = vector.extract %216[0, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%498 = vector.insert %497, %496 [0, 0, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%499 = vector.extract %216[0, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%500 = vector.insert %499, %498 [0, 0, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%501 = vector.extract %216[0, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%502 = vector.insert %501, %500 [0, 0, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%503 = vector.extract %216[0, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%504 = vector.insert %503, %502 [0, 0, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%505 = vector.extract %216[0, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%506 = vector.insert %505, %504 [0, 0, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%507 = vector.extract %216[0, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%508 = vector.insert %507, %506 [0, 0, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%509 = vector.extract %216[0, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%510 = vector.insert %509, %508 [0, 0, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%511 = vector.extract %216[0, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%512 = vector.insert %511, %510 [0, 0, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%513 = vector.extract %216[0, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%514 = vector.insert %513, %512 [0, 0, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%515 = vector.extract %216[0, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%516 = vector.insert %515, %514 [0, 0, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%517 = vector.extract %216[0, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%518 = vector.insert %517, %516 [0, 0, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%519 = vector.extract %216[0, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%520 = vector.insert %519, %518 [0, 0, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%521 = vector.extract %216[0, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%522 = vector.insert %521, %520 [0, 0, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%523 = vector.extract %216[0, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%524 = vector.insert %523, %522 [0, 0, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%525 = vector.extract %216[0, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%526 = vector.insert %525, %524 [0, 0, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%527 = vector.extract %216[0, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%528 = vector.insert %527, %526 [0, 0, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%529 = vector.extract %216[0, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%530 = vector.insert %529, %528 [0, 0, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%531 = vector.extract %216[0, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%532 = vector.insert %531, %530 [0, 0, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%533 = vector.extract %216[0, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%534 = vector.insert %533, %532 [0, 0, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%535 = vector.extract %216[0, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%536 = vector.insert %535, %534 [0, 0, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%537 = vector.extract %216[0, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%538 = vector.insert %537, %536 [0, 0, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%539 = vector.extract %216[0, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%540 = vector.insert %539, %538 [0, 0, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%541 = vector.extract %216[0, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%542 = vector.insert %541, %540 [0, 0, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%543 = vector.extract %216[0, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%544 = vector.insert %543, %542 [0, 0, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%545 = vector.extract %216[0, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%546 = vector.insert %545, %544 [0, 0, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%547 = vector.extract %216[0, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%548 = vector.insert %547, %546 [0, 0, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%549 = vector.extract %216[0, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%550 = vector.insert %549, %548 [0, 0, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%551 = vector.extract %216[0, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%552 = vector.insert %551, %550 [0, 0, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%553 = vector.extract %216[0, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%554 = vector.insert %553, %552 [0, 0, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%555 = vector.extract %216[0, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%556 = vector.insert %555, %554 [0, 0, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%557 = vector.extract %216[0, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%558 = vector.insert %557, %556 [0, 0, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%559 = vector.extract %216[0, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%560 = vector.insert %559, %558 [0, 0, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%561 = vector.extract %216[0, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%562 = vector.insert %561, %560 [0, 0, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%563 = vector.extract %216[0, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%564 = vector.insert %563, %562 [0, 0, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%565 = vector.extract %216[0, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%566 = vector.insert %565, %564 [0, 0, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%567 = vector.extract %216[0, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%568 = vector.insert %567, %566 [0, 0, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%569 = vector.extract %216[0, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%570 = vector.insert %569, %568 [0, 0, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%571 = vector.extract %216[0, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%572 = vector.insert %571, %570 [0, 0, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%573 = vector.extract %216[0, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%574 = vector.insert %573, %572 [0, 0, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%575 = vector.extract %216[0, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%576 = vector.insert %575, %574 [0, 0, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%577 = vector.extract %216[0, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%578 = vector.insert %577, %576 [0, 0, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%579 = vector.extract %216[0, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%580 = vector.insert %579, %578 [0, 0, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%581 = vector.extract %216[0, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%582 = vector.insert %581, %580 [0, 0, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%583 = vector.extract %216[0, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%584 = vector.insert %583, %582 [0, 0, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%585 = vector.extract %216[0, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%586 = vector.insert %585, %584 [0, 0, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%587 = vector.extract %216[0, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%588 = vector.insert %587, %586 [0, 0, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%589 = vector.extract %216[0, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%590 = vector.insert %589, %588 [0, 0, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%591 = vector.extract %216[0, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%592 = vector.insert %591, %590 [0, 0, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%593 = vector.extract %216[0, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%594 = vector.insert %593, %592 [0, 0, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%595 = vector.extract %216[0, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%596 = vector.insert %595, %594 [0, 0, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%597 = vector.extract %216[0, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%598 = vector.insert %597, %596 [0, 0, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%599 = vector.extract %216[0, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%600 = vector.insert %599, %598 [0, 0, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%601 = vector.extract %216[0, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%602 = vector.insert %601, %600 [0, 0, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%603 = vector.extract %216[0, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%604 = vector.insert %603, %602 [0, 0, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%605 = vector.extract %216[0, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%606 = vector.insert %605, %604 [0, 0, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%607 = vector.extract %216[0, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%608 = vector.insert %607, %606 [0, 0, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%609 = vector.extract %216[0, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%610 = vector.insert %609, %608 [0, 0, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%611 = vector.extract %216[0, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%612 = vector.insert %611, %610 [0, 0, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%613 = vector.extract %216[0, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%614 = vector.insert %613, %612 [0, 0, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%615 = vector.extract %216[0, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%616 = vector.insert %615, %614 [0, 0, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%617 = vector.extract %216[0, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%618 = vector.insert %617, %616 [0, 0, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%619 = vector.extract %216[0, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%620 = vector.insert %619, %618 [0, 0, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%621 = vector.extract %216[0, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%622 = vector.insert %621, %620 [0, 0, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%623 = vector.extract %216[0, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%624 = vector.insert %623, %622 [0, 0, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%625 = vector.extract %216[0, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%626 = vector.insert %625, %624 [0, 0, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%627 = vector.extract %216[0, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%628 = vector.insert %627, %626 [0, 0, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%629 = vector.extract %216[0, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%630 = vector.insert %629, %628 [0, 0, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%631 = vector.extract %216[0, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%632 = vector.insert %631, %630 [0, 0, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%633 = vector.extract %216[0, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%634 = vector.insert %633, %632 [0, 0, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%635 = vector.extract %216[0, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%636 = vector.insert %635, %634 [0, 0, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%637 = vector.extract %216[0, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%638 = vector.insert %637, %636 [0, 0, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%639 = vector.extract %216[0, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%640 = vector.insert %639, %638 [0, 0, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%641 = vector.extract %216[0, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%642 = vector.insert %641, %640 [0, 0, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%643 = vector.extract %216[0, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%644 = vector.insert %643, %642 [0, 0, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%645 = vector.extract %216[0, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%646 = vector.insert %645, %644 [0, 0, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%647 = vector.extract %216[0, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%648 = vector.insert %647, %646 [0, 0, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%649 = vector.extract %216[0, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%650 = vector.insert %649, %648 [0, 0, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%651 = vector.extract %216[0, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%652 = vector.insert %651, %650 [0, 0, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%653 = vector.extract %216[0, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%654 = vector.insert %653, %652 [0, 0, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%655 = vector.extract %216[0, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%656 = vector.insert %655, %654 [0, 0, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%657 = vector.extract %216[0, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%658 = vector.insert %657, %656 [0, 0, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%659 = vector.extract %216[0, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%660 = vector.insert %659, %658 [0, 0, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%661 = vector.extract %216[0, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%662 = vector.insert %661, %660 [0, 0, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%663 = vector.extract %216[0, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%664 = vector.insert %663, %662 [0, 0, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%665 = vector.extract %216[0, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%666 = vector.insert %665, %664 [0, 0, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%667 = vector.extract %216[0, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%668 = vector.insert %667, %666 [0, 0, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%669 = vector.extract %216[0, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%670 = vector.insert %669, %668 [0, 0, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%671 = vector.extract %216[0, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%672 = vector.insert %671, %670 [0, 0, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%673 = vector.extract %216[0, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%674 = vector.insert %673, %672 [0, 0, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%675 = vector.extract %216[0, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%676 = vector.insert %675, %674 [0, 0, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%677 = vector.extract %216[0, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%678 = vector.insert %677, %676 [0, 0, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%679 = vector.extract %216[0, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%680 = vector.insert %679, %678 [0, 0, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%681 = vector.extract %216[0, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%682 = vector.insert %681, %680 [0, 0, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%683 = vector.extract %216[0, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%684 = vector.insert %683, %682 [0, 0, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%685 = vector.extract %216[0, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%686 = vector.insert %685, %684 [0, 0, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%687 = vector.extract %216[0, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%688 = vector.insert %687, %686 [0, 0, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%689 = vector.extract %216[0, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%690 = vector.insert %689, %688 [0, 0, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%691 = vector.extract %216[0, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%692 = vector.insert %691, %690 [0, 0, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%693 = vector.extract %216[0, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%694 = vector.insert %693, %692 [0, 0, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%695 = vector.extract %216[0, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%696 = vector.insert %695, %694 [0, 0, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%697 = vector.extract %216[0, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%698 = vector.insert %697, %696 [0, 0, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%699 = vector.extract %216[0, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%700 = vector.insert %699, %698 [0, 0, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%701 = vector.extract %216[0, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%702 = vector.insert %701, %700 [0, 0, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%703 = vector.extract %216[0, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%704 = vector.insert %703, %702 [0, 0, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%705 = vector.extract %216[0, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%706 = vector.insert %705, %704 [0, 0, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%707 = vector.extract %216[0, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%708 = vector.insert %707, %706 [0, 0, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%709 = vector.extract %216[0, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%710 = vector.insert %709, %708 [0, 0, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%711 = vector.extract %216[0, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%712 = vector.insert %711, %710 [0, 0, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%713 = vector.extract %216[0, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%714 = vector.insert %713, %712 [0, 0, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%715 = vector.extract %216[0, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%716 = vector.insert %715, %714 [0, 0, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%717 = vector.extract %216[0, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%718 = vector.insert %717, %716 [0, 0, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%719 = vector.extract %216[0, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%720 = vector.insert %719, %718 [0, 0, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%721 = vector.extract %216[0, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%722 = vector.insert %721, %720 [0, 0, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%723 = vector.extract %216[0, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%724 = vector.insert %723, %722 [0, 0, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%725 = vector.extract %216[0, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%726 = vector.insert %725, %724 [0, 0, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%727 = vector.extract %216[0, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%728 = vector.insert %727, %726 [0, 0, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%729 = vector.extract %216[1, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%730 = vector.insert %729, %728 [0, 1, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%731 = vector.extract %216[1, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%732 = vector.insert %731, %730 [0, 1, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%733 = vector.extract %216[1, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%734 = vector.insert %733, %732 [0, 1, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%735 = vector.extract %216[1, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%736 = vector.insert %735, %734 [0, 1, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%737 = vector.extract %216[1, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%738 = vector.insert %737, %736 [0, 1, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%739 = vector.extract %216[1, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%740 = vector.insert %739, %738 [0, 1, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%741 = vector.extract %216[1, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%742 = vector.insert %741, %740 [0, 1, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%743 = vector.extract %216[1, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%744 = vector.insert %743, %742 [0, 1, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%745 = vector.extract %216[1, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%746 = vector.insert %745, %744 [0, 1, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%747 = vector.extract %216[1, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%748 = vector.insert %747, %746 [0, 1, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%749 = vector.extract %216[1, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%750 = vector.insert %749, %748 [0, 1, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%751 = vector.extract %216[1, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%752 = vector.insert %751, %750 [0, 1, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%753 = vector.extract %216[1, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%754 = vector.insert %753, %752 [0, 1, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%755 = vector.extract %216[1, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%756 = vector.insert %755, %754 [0, 1, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%757 = vector.extract %216[1, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%758 = vector.insert %757, %756 [0, 1, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%759 = vector.extract %216[1, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%760 = vector.insert %759, %758 [0, 1, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%761 = vector.extract %216[1, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%762 = vector.insert %761, %760 [0, 1, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%763 = vector.extract %216[1, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%764 = vector.insert %763, %762 [0, 1, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%765 = vector.extract %216[1, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%766 = vector.insert %765, %764 [0, 1, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%767 = vector.extract %216[1, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%768 = vector.insert %767, %766 [0, 1, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%769 = vector.extract %216[1, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%770 = vector.insert %769, %768 [0, 1, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%771 = vector.extract %216[1, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%772 = vector.insert %771, %770 [0, 1, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%773 = vector.extract %216[1, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%774 = vector.insert %773, %772 [0, 1, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%775 = vector.extract %216[1, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%776 = vector.insert %775, %774 [0, 1, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%777 = vector.extract %216[1, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%778 = vector.insert %777, %776 [0, 1, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%779 = vector.extract %216[1, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%780 = vector.insert %779, %778 [0, 1, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%781 = vector.extract %216[1, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%782 = vector.insert %781, %780 [0, 1, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%783 = vector.extract %216[1, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%784 = vector.insert %783, %782 [0, 1, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%785 = vector.extract %216[1, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%786 = vector.insert %785, %784 [0, 1, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%787 = vector.extract %216[1, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%788 = vector.insert %787, %786 [0, 1, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%789 = vector.extract %216[1, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%790 = vector.insert %789, %788 [0, 1, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%791 = vector.extract %216[1, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%792 = vector.insert %791, %790 [0, 1, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%793 = vector.extract %216[1, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%794 = vector.insert %793, %792 [0, 1, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%795 = vector.extract %216[1, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%796 = vector.insert %795, %794 [0, 1, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%797 = vector.extract %216[1, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%798 = vector.insert %797, %796 [0, 1, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%799 = vector.extract %216[1, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%800 = vector.insert %799, %798 [0, 1, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%801 = vector.extract %216[1, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%802 = vector.insert %801, %800 [0, 1, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%803 = vector.extract %216[1, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%804 = vector.insert %803, %802 [0, 1, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%805 = vector.extract %216[1, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%806 = vector.insert %805, %804 [0, 1, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%807 = vector.extract %216[1, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%808 = vector.insert %807, %806 [0, 1, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%809 = vector.extract %216[1, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%810 = vector.insert %809, %808 [0, 1, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%811 = vector.extract %216[1, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%812 = vector.insert %811, %810 [0, 1, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%813 = vector.extract %216[1, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%814 = vector.insert %813, %812 [0, 1, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%815 = vector.extract %216[1, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%816 = vector.insert %815, %814 [0, 1, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%817 = vector.extract %216[1, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%818 = vector.insert %817, %816 [0, 1, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%819 = vector.extract %216[1, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%820 = vector.insert %819, %818 [0, 1, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%821 = vector.extract %216[1, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%822 = vector.insert %821, %820 [0, 1, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%823 = vector.extract %216[1, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%824 = vector.insert %823, %822 [0, 1, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%825 = vector.extract %216[1, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%826 = vector.insert %825, %824 [0, 1, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%827 = vector.extract %216[1, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%828 = vector.insert %827, %826 [0, 1, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%829 = vector.extract %216[1, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%830 = vector.insert %829, %828 [0, 1, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%831 = vector.extract %216[1, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%832 = vector.insert %831, %830 [0, 1, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%833 = vector.extract %216[1, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%834 = vector.insert %833, %832 [0, 1, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%835 = vector.extract %216[1, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%836 = vector.insert %835, %834 [0, 1, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%837 = vector.extract %216[1, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%838 = vector.insert %837, %836 [0, 1, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%839 = vector.extract %216[1, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%840 = vector.insert %839, %838 [0, 1, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%841 = vector.extract %216[1, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%842 = vector.insert %841, %840 [0, 1, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%843 = vector.extract %216[1, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%844 = vector.insert %843, %842 [0, 1, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%845 = vector.extract %216[1, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%846 = vector.insert %845, %844 [0, 1, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%847 = vector.extract %216[1, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%848 = vector.insert %847, %846 [0, 1, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%849 = vector.extract %216[1, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%850 = vector.insert %849, %848 [0, 1, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%851 = vector.extract %216[1, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%852 = vector.insert %851, %850 [0, 1, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%853 = vector.extract %216[1, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%854 = vector.insert %853, %852 [0, 1, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%855 = vector.extract %216[1, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%856 = vector.insert %855, %854 [0, 1, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%857 = vector.extract %216[1, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%858 = vector.insert %857, %856 [0, 1, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%859 = vector.extract %216[1, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%860 = vector.insert %859, %858 [0, 1, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%861 = vector.extract %216[1, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%862 = vector.insert %861, %860 [0, 1, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%863 = vector.extract %216[1, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%864 = vector.insert %863, %862 [0, 1, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%865 = vector.extract %216[1, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%866 = vector.insert %865, %864 [0, 1, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%867 = vector.extract %216[1, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%868 = vector.insert %867, %866 [0, 1, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%869 = vector.extract %216[1, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%870 = vector.insert %869, %868 [0, 1, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%871 = vector.extract %216[1, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%872 = vector.insert %871, %870 [0, 1, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%873 = vector.extract %216[1, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%874 = vector.insert %873, %872 [0, 1, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%875 = vector.extract %216[1, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%876 = vector.insert %875, %874 [0, 1, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%877 = vector.extract %216[1, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%878 = vector.insert %877, %876 [0, 1, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%879 = vector.extract %216[1, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%880 = vector.insert %879, %878 [0, 1, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%881 = vector.extract %216[1, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%882 = vector.insert %881, %880 [0, 1, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%883 = vector.extract %216[1, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%884 = vector.insert %883, %882 [0, 1, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%885 = vector.extract %216[1, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%886 = vector.insert %885, %884 [0, 1, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%887 = vector.extract %216[1, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%888 = vector.insert %887, %886 [0, 1, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%889 = vector.extract %216[1, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%890 = vector.insert %889, %888 [0, 1, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%891 = vector.extract %216[1, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%892 = vector.insert %891, %890 [0, 1, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%893 = vector.extract %216[1, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%894 = vector.insert %893, %892 [0, 1, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%895 = vector.extract %216[1, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%896 = vector.insert %895, %894 [0, 1, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%897 = vector.extract %216[1, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%898 = vector.insert %897, %896 [0, 1, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%899 = vector.extract %216[1, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%900 = vector.insert %899, %898 [0, 1, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%901 = vector.extract %216[1, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%902 = vector.insert %901, %900 [0, 1, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%903 = vector.extract %216[1, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%904 = vector.insert %903, %902 [0, 1, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%905 = vector.extract %216[1, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%906 = vector.insert %905, %904 [0, 1, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%907 = vector.extract %216[1, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%908 = vector.insert %907, %906 [0, 1, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%909 = vector.extract %216[1, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%910 = vector.insert %909, %908 [0, 1, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%911 = vector.extract %216[1, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%912 = vector.insert %911, %910 [0, 1, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%913 = vector.extract %216[1, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%914 = vector.insert %913, %912 [0, 1, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%915 = vector.extract %216[1, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%916 = vector.insert %915, %914 [0, 1, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%917 = vector.extract %216[1, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%918 = vector.insert %917, %916 [0, 1, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%919 = vector.extract %216[1, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%920 = vector.insert %919, %918 [0, 1, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%921 = vector.extract %216[1, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%922 = vector.insert %921, %920 [0, 1, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%923 = vector.extract %216[1, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%924 = vector.insert %923, %922 [0, 1, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%925 = vector.extract %216[1, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%926 = vector.insert %925, %924 [0, 1, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%927 = vector.extract %216[1, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%928 = vector.insert %927, %926 [0, 1, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%929 = vector.extract %216[1, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%930 = vector.insert %929, %928 [0, 1, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%931 = vector.extract %216[1, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%932 = vector.insert %931, %930 [0, 1, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%933 = vector.extract %216[1, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%934 = vector.insert %933, %932 [0, 1, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%935 = vector.extract %216[1, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%936 = vector.insert %935, %934 [0, 1, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%937 = vector.extract %216[1, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%938 = vector.insert %937, %936 [0, 1, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%939 = vector.extract %216[1, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%940 = vector.insert %939, %938 [0, 1, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%941 = vector.extract %216[1, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%942 = vector.insert %941, %940 [0, 1, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%943 = vector.extract %216[1, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%944 = vector.insert %943, %942 [0, 1, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%945 = vector.extract %216[1, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%946 = vector.insert %945, %944 [0, 1, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%947 = vector.extract %216[1, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%948 = vector.insert %947, %946 [0, 1, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%949 = vector.extract %216[1, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%950 = vector.insert %949, %948 [0, 1, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%951 = vector.extract %216[1, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%952 = vector.insert %951, %950 [0, 1, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%953 = vector.extract %216[1, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%954 = vector.insert %953, %952 [0, 1, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%955 = vector.extract %216[1, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%956 = vector.insert %955, %954 [0, 1, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%957 = vector.extract %216[1, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%958 = vector.insert %957, %956 [0, 1, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%959 = vector.extract %216[1, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%960 = vector.insert %959, %958 [0, 1, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%961 = vector.extract %216[1, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%962 = vector.insert %961, %960 [0, 1, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%963 = vector.extract %216[1, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%964 = vector.insert %963, %962 [0, 1, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%965 = vector.extract %216[1, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%966 = vector.insert %965, %964 [0, 1, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%967 = vector.extract %216[1, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%968 = vector.insert %967, %966 [0, 1, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%969 = vector.extract %216[1, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%970 = vector.insert %969, %968 [0, 1, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%971 = vector.extract %216[1, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%972 = vector.insert %971, %970 [0, 1, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%973 = vector.extract %216[1, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%974 = vector.insert %973, %972 [0, 1, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%975 = vector.extract %216[1, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%976 = vector.insert %975, %974 [0, 1, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%977 = vector.extract %216[1, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%978 = vector.insert %977, %976 [0, 1, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%979 = vector.extract %216[1, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%980 = vector.insert %979, %978 [0, 1, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%981 = vector.extract %216[1, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%982 = vector.insert %981, %980 [0, 1, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%983 = vector.extract %216[1, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%984 = vector.insert %983, %982 [0, 1, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%985 = vector.extract %216[1, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%986 = vector.insert %985, %984 [0, 1, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%987 = vector.extract %216[1, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%988 = vector.insert %987, %986 [0, 1, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%989 = vector.extract %216[1, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%990 = vector.insert %989, %988 [0, 1, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%991 = vector.extract %216[1, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%992 = vector.insert %991, %990 [0, 1, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%993 = vector.extract %216[1, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%994 = vector.insert %993, %992 [0, 1, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%995 = vector.extract %216[1, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%996 = vector.insert %995, %994 [0, 1, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%997 = vector.extract %216[1, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%998 = vector.insert %997, %996 [0, 1, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%999 = vector.extract %216[1, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1000 = vector.insert %999, %998 [0, 1, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1001 = vector.extract %216[1, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1002 = vector.insert %1001, %1000 [0, 1, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1003 = vector.extract %216[1, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1004 = vector.insert %1003, %1002 [0, 1, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1005 = vector.extract %216[1, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1006 = vector.insert %1005, %1004 [0, 1, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1007 = vector.extract %216[1, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1008 = vector.insert %1007, %1006 [0, 1, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1009 = vector.extract %216[1, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1010 = vector.insert %1009, %1008 [0, 1, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1011 = vector.extract %216[1, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1012 = vector.insert %1011, %1010 [0, 1, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1013 = vector.extract %216[1, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1014 = vector.insert %1013, %1012 [0, 1, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1015 = vector.extract %216[1, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1016 = vector.insert %1015, %1014 [0, 1, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1017 = vector.extract %216[1, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1018 = vector.insert %1017, %1016 [0, 1, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1019 = vector.extract %216[1, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1020 = vector.insert %1019, %1018 [0, 1, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1021 = vector.extract %216[1, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1022 = vector.insert %1021, %1020 [0, 1, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1023 = vector.extract %216[1, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1024 = vector.insert %1023, %1022 [0, 1, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1025 = vector.extract %216[1, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1026 = vector.insert %1025, %1024 [0, 1, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1027 = vector.extract %216[1, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1028 = vector.insert %1027, %1026 [0, 1, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1029 = vector.extract %216[1, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1030 = vector.insert %1029, %1028 [0, 1, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1031 = vector.extract %216[1, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1032 = vector.insert %1031, %1030 [0, 1, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1033 = vector.extract %216[1, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1034 = vector.insert %1033, %1032 [0, 1, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1035 = vector.extract %216[1, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1036 = vector.insert %1035, %1034 [0, 1, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1037 = vector.extract %216[1, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1038 = vector.insert %1037, %1036 [0, 1, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1039 = vector.extract %216[1, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1040 = vector.insert %1039, %1038 [0, 1, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1041 = vector.extract %216[1, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1042 = vector.insert %1041, %1040 [0, 1, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1043 = vector.extract %216[1, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1044 = vector.insert %1043, %1042 [0, 1, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1045 = vector.extract %216[1, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1046 = vector.insert %1045, %1044 [0, 1, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1047 = vector.extract %216[1, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1048 = vector.insert %1047, %1046 [0, 1, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1049 = vector.extract %216[1, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1050 = vector.insert %1049, %1048 [0, 1, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1051 = vector.extract %216[1, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1052 = vector.insert %1051, %1050 [0, 1, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1053 = vector.extract %216[1, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1054 = vector.insert %1053, %1052 [0, 1, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1055 = vector.extract %216[1, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1056 = vector.insert %1055, %1054 [0, 1, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1057 = vector.extract %216[1, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1058 = vector.insert %1057, %1056 [0, 1, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1059 = vector.extract %216[1, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1060 = vector.insert %1059, %1058 [0, 1, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1061 = vector.extract %216[1, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1062 = vector.insert %1061, %1060 [0, 1, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1063 = vector.extract %216[1, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1064 = vector.insert %1063, %1062 [0, 1, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1065 = vector.extract %216[1, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1066 = vector.insert %1065, %1064 [0, 1, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1067 = vector.extract %216[1, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1068 = vector.insert %1067, %1066 [0, 1, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1069 = vector.extract %216[1, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1070 = vector.insert %1069, %1068 [0, 1, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1071 = vector.extract %216[1, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1072 = vector.insert %1071, %1070 [0, 1, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1073 = vector.extract %216[1, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1074 = vector.insert %1073, %1072 [0, 1, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1075 = vector.extract %216[1, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1076 = vector.insert %1075, %1074 [0, 1, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1077 = vector.extract %216[1, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1078 = vector.insert %1077, %1076 [0, 1, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1079 = vector.extract %216[1, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1080 = vector.insert %1079, %1078 [0, 1, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1081 = vector.extract %216[1, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1082 = vector.insert %1081, %1080 [0, 1, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1083 = vector.extract %216[1, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1084 = vector.insert %1083, %1082 [0, 1, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1085 = vector.extract %216[1, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1086 = vector.insert %1085, %1084 [0, 1, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1087 = vector.extract %216[1, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1088 = vector.insert %1087, %1086 [0, 1, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1089 = vector.extract %216[1, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1090 = vector.insert %1089, %1088 [0, 1, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1091 = vector.extract %216[1, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1092 = vector.insert %1091, %1090 [0, 1, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1093 = vector.extract %216[1, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1094 = vector.insert %1093, %1092 [0, 1, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1095 = vector.extract %216[1, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1096 = vector.insert %1095, %1094 [0, 1, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1097 = vector.extract %216[1, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1098 = vector.insert %1097, %1096 [0, 1, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1099 = vector.extract %216[1, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1100 = vector.insert %1099, %1098 [0, 1, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1101 = vector.extract %216[1, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1102 = vector.insert %1101, %1100 [0, 1, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1103 = vector.extract %216[1, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1104 = vector.insert %1103, %1102 [0, 1, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1105 = vector.extract %216[1, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1106 = vector.insert %1105, %1104 [0, 1, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1107 = vector.extract %216[1, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1108 = vector.insert %1107, %1106 [0, 1, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1109 = vector.extract %216[1, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1110 = vector.insert %1109, %1108 [0, 1, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1111 = vector.extract %216[1, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1112 = vector.insert %1111, %1110 [0, 1, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1113 = vector.extract %216[1, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1114 = vector.insert %1113, %1112 [0, 1, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1115 = vector.extract %216[1, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1116 = vector.insert %1115, %1114 [0, 1, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1117 = vector.extract %216[1, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1118 = vector.insert %1117, %1116 [0, 1, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1119 = vector.extract %216[1, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1120 = vector.insert %1119, %1118 [0, 1, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1121 = vector.extract %216[1, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1122 = vector.insert %1121, %1120 [0, 1, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1123 = vector.extract %216[1, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1124 = vector.insert %1123, %1122 [0, 1, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1125 = vector.extract %216[1, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1126 = vector.insert %1125, %1124 [0, 1, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1127 = vector.extract %216[1, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1128 = vector.insert %1127, %1126 [0, 1, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1129 = vector.extract %216[1, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1130 = vector.insert %1129, %1128 [0, 1, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1131 = vector.extract %216[1, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1132 = vector.insert %1131, %1130 [0, 1, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1133 = vector.extract %216[1, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1134 = vector.insert %1133, %1132 [0, 1, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1135 = vector.extract %216[1, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1136 = vector.insert %1135, %1134 [0, 1, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1137 = vector.extract %216[1, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1138 = vector.insert %1137, %1136 [0, 1, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1139 = vector.extract %216[1, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1140 = vector.insert %1139, %1138 [0, 1, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1141 = vector.extract %216[1, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1142 = vector.insert %1141, %1140 [0, 1, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1143 = vector.extract %216[1, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1144 = vector.insert %1143, %1142 [0, 1, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1145 = vector.extract %216[1, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1146 = vector.insert %1145, %1144 [0, 1, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1147 = vector.extract %216[1, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1148 = vector.insert %1147, %1146 [0, 1, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1149 = vector.extract %216[1, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1150 = vector.insert %1149, %1148 [0, 1, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1151 = vector.extract %216[1, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1152 = vector.insert %1151, %1150 [0, 1, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1153 = vector.extract %216[1, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1154 = vector.insert %1153, %1152 [0, 1, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1155 = vector.extract %216[1, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1156 = vector.insert %1155, %1154 [0, 1, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1157 = vector.extract %216[1, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1158 = vector.insert %1157, %1156 [0, 1, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1159 = vector.extract %216[1, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1160 = vector.insert %1159, %1158 [0, 1, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1161 = vector.extract %216[1, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1162 = vector.insert %1161, %1160 [0, 1, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1163 = vector.extract %216[1, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1164 = vector.insert %1163, %1162 [0, 1, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1165 = vector.extract %216[1, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1166 = vector.insert %1165, %1164 [0, 1, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1167 = vector.extract %216[1, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1168 = vector.insert %1167, %1166 [0, 1, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1169 = vector.extract %216[1, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1170 = vector.insert %1169, %1168 [0, 1, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1171 = vector.extract %216[1, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1172 = vector.insert %1171, %1170 [0, 1, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1173 = vector.extract %216[1, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1174 = vector.insert %1173, %1172 [0, 1, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1175 = vector.extract %216[1, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1176 = vector.insert %1175, %1174 [0, 1, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1177 = vector.extract %216[1, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1178 = vector.insert %1177, %1176 [0, 1, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1179 = vector.extract %216[1, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1180 = vector.insert %1179, %1178 [0, 1, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1181 = vector.extract %216[1, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1182 = vector.insert %1181, %1180 [0, 1, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1183 = vector.extract %216[1, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1184 = vector.insert %1183, %1182 [0, 1, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1185 = vector.extract %216[1, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1186 = vector.insert %1185, %1184 [0, 1, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1187 = vector.extract %216[1, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1188 = vector.insert %1187, %1186 [0, 1, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1189 = vector.extract %216[1, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1190 = vector.insert %1189, %1188 [0, 1, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1191 = vector.extract %216[1, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1192 = vector.insert %1191, %1190 [0, 1, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1193 = vector.extract %216[1, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1194 = vector.insert %1193, %1192 [0, 1, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1195 = vector.extract %216[1, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1196 = vector.insert %1195, %1194 [0, 1, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1197 = vector.extract %216[1, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1198 = vector.insert %1197, %1196 [0, 1, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1199 = vector.extract %216[1, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1200 = vector.insert %1199, %1198 [0, 1, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1201 = vector.extract %216[1, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1202 = vector.insert %1201, %1200 [0, 1, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1203 = vector.extract %216[1, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1204 = vector.insert %1203, %1202 [0, 1, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1205 = vector.extract %216[1, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1206 = vector.insert %1205, %1204 [0, 1, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1207 = vector.extract %216[1, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1208 = vector.insert %1207, %1206 [0, 1, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1209 = vector.extract %216[1, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1210 = vector.insert %1209, %1208 [0, 1, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1211 = vector.extract %216[1, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1212 = vector.insert %1211, %1210 [0, 1, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1213 = vector.extract %216[1, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1214 = vector.insert %1213, %1212 [0, 1, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1215 = vector.extract %216[1, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1216 = vector.insert %1215, %1214 [0, 1, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1217 = vector.extract %216[1, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1218 = vector.insert %1217, %1216 [0, 1, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1219 = vector.extract %216[1, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1220 = vector.insert %1219, %1218 [0, 1, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1221 = vector.extract %216[1, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1222 = vector.insert %1221, %1220 [0, 1, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1223 = vector.extract %216[1, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1224 = vector.insert %1223, %1222 [0, 1, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1225 = vector.extract %216[1, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1226 = vector.insert %1225, %1224 [0, 1, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1227 = vector.extract %216[1, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1228 = vector.insert %1227, %1226 [0, 1, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1229 = vector.extract %216[1, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1230 = vector.insert %1229, %1228 [0, 1, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1231 = vector.extract %216[1, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1232 = vector.insert %1231, %1230 [0, 1, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1233 = vector.extract %216[1, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1234 = vector.insert %1233, %1232 [0, 1, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1235 = vector.extract %216[1, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1236 = vector.insert %1235, %1234 [0, 1, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1237 = vector.extract %216[1, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1238 = vector.insert %1237, %1236 [0, 1, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1239 = vector.extract %216[1, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1240 = vector.insert %1239, %1238 [0, 1, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1241 = vector.extract %216[2, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1242 = vector.insert %1241, %1240 [0, 2, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1243 = vector.extract %216[2, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1244 = vector.insert %1243, %1242 [0, 2, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1245 = vector.extract %216[2, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1246 = vector.insert %1245, %1244 [0, 2, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1247 = vector.extract %216[2, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1248 = vector.insert %1247, %1246 [0, 2, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1249 = vector.extract %216[2, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1250 = vector.insert %1249, %1248 [0, 2, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1251 = vector.extract %216[2, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1252 = vector.insert %1251, %1250 [0, 2, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1253 = vector.extract %216[2, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1254 = vector.insert %1253, %1252 [0, 2, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1255 = vector.extract %216[2, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1256 = vector.insert %1255, %1254 [0, 2, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1257 = vector.extract %216[2, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1258 = vector.insert %1257, %1256 [0, 2, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1259 = vector.extract %216[2, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1260 = vector.insert %1259, %1258 [0, 2, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1261 = vector.extract %216[2, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1262 = vector.insert %1261, %1260 [0, 2, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1263 = vector.extract %216[2, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1264 = vector.insert %1263, %1262 [0, 2, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1265 = vector.extract %216[2, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1266 = vector.insert %1265, %1264 [0, 2, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1267 = vector.extract %216[2, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1268 = vector.insert %1267, %1266 [0, 2, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1269 = vector.extract %216[2, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1270 = vector.insert %1269, %1268 [0, 2, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1271 = vector.extract %216[2, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1272 = vector.insert %1271, %1270 [0, 2, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1273 = vector.extract %216[2, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1274 = vector.insert %1273, %1272 [0, 2, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1275 = vector.extract %216[2, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1276 = vector.insert %1275, %1274 [0, 2, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1277 = vector.extract %216[2, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1278 = vector.insert %1277, %1276 [0, 2, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1279 = vector.extract %216[2, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1280 = vector.insert %1279, %1278 [0, 2, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1281 = vector.extract %216[2, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1282 = vector.insert %1281, %1280 [0, 2, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1283 = vector.extract %216[2, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1284 = vector.insert %1283, %1282 [0, 2, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1285 = vector.extract %216[2, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1286 = vector.insert %1285, %1284 [0, 2, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1287 = vector.extract %216[2, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1288 = vector.insert %1287, %1286 [0, 2, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1289 = vector.extract %216[2, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1290 = vector.insert %1289, %1288 [0, 2, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1291 = vector.extract %216[2, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1292 = vector.insert %1291, %1290 [0, 2, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1293 = vector.extract %216[2, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1294 = vector.insert %1293, %1292 [0, 2, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1295 = vector.extract %216[2, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1296 = vector.insert %1295, %1294 [0, 2, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1297 = vector.extract %216[2, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1298 = vector.insert %1297, %1296 [0, 2, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1299 = vector.extract %216[2, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1300 = vector.insert %1299, %1298 [0, 2, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1301 = vector.extract %216[2, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1302 = vector.insert %1301, %1300 [0, 2, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1303 = vector.extract %216[2, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1304 = vector.insert %1303, %1302 [0, 2, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1305 = vector.extract %216[2, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1306 = vector.insert %1305, %1304 [0, 2, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1307 = vector.extract %216[2, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1308 = vector.insert %1307, %1306 [0, 2, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1309 = vector.extract %216[2, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1310 = vector.insert %1309, %1308 [0, 2, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1311 = vector.extract %216[2, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1312 = vector.insert %1311, %1310 [0, 2, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1313 = vector.extract %216[2, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1314 = vector.insert %1313, %1312 [0, 2, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1315 = vector.extract %216[2, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1316 = vector.insert %1315, %1314 [0, 2, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1317 = vector.extract %216[2, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1318 = vector.insert %1317, %1316 [0, 2, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1319 = vector.extract %216[2, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1320 = vector.insert %1319, %1318 [0, 2, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1321 = vector.extract %216[2, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1322 = vector.insert %1321, %1320 [0, 2, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1323 = vector.extract %216[2, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1324 = vector.insert %1323, %1322 [0, 2, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1325 = vector.extract %216[2, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1326 = vector.insert %1325, %1324 [0, 2, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1327 = vector.extract %216[2, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1328 = vector.insert %1327, %1326 [0, 2, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1329 = vector.extract %216[2, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1330 = vector.insert %1329, %1328 [0, 2, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1331 = vector.extract %216[2, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1332 = vector.insert %1331, %1330 [0, 2, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1333 = vector.extract %216[2, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1334 = vector.insert %1333, %1332 [0, 2, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1335 = vector.extract %216[2, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1336 = vector.insert %1335, %1334 [0, 2, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1337 = vector.extract %216[2, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1338 = vector.insert %1337, %1336 [0, 2, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1339 = vector.extract %216[2, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1340 = vector.insert %1339, %1338 [0, 2, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1341 = vector.extract %216[2, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1342 = vector.insert %1341, %1340 [0, 2, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1343 = vector.extract %216[2, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1344 = vector.insert %1343, %1342 [0, 2, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1345 = vector.extract %216[2, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1346 = vector.insert %1345, %1344 [0, 2, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1347 = vector.extract %216[2, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1348 = vector.insert %1347, %1346 [0, 2, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1349 = vector.extract %216[2, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1350 = vector.insert %1349, %1348 [0, 2, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1351 = vector.extract %216[2, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1352 = vector.insert %1351, %1350 [0, 2, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1353 = vector.extract %216[2, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1354 = vector.insert %1353, %1352 [0, 2, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1355 = vector.extract %216[2, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1356 = vector.insert %1355, %1354 [0, 2, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1357 = vector.extract %216[2, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1358 = vector.insert %1357, %1356 [0, 2, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1359 = vector.extract %216[2, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1360 = vector.insert %1359, %1358 [0, 2, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1361 = vector.extract %216[2, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1362 = vector.insert %1361, %1360 [0, 2, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1363 = vector.extract %216[2, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1364 = vector.insert %1363, %1362 [0, 2, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1365 = vector.extract %216[2, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1366 = vector.insert %1365, %1364 [0, 2, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1367 = vector.extract %216[2, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1368 = vector.insert %1367, %1366 [0, 2, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1369 = vector.extract %216[2, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1370 = vector.insert %1369, %1368 [0, 2, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1371 = vector.extract %216[2, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1372 = vector.insert %1371, %1370 [0, 2, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1373 = vector.extract %216[2, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1374 = vector.insert %1373, %1372 [0, 2, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1375 = vector.extract %216[2, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1376 = vector.insert %1375, %1374 [0, 2, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1377 = vector.extract %216[2, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1378 = vector.insert %1377, %1376 [0, 2, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1379 = vector.extract %216[2, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1380 = vector.insert %1379, %1378 [0, 2, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1381 = vector.extract %216[2, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1382 = vector.insert %1381, %1380 [0, 2, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1383 = vector.extract %216[2, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1384 = vector.insert %1383, %1382 [0, 2, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1385 = vector.extract %216[2, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1386 = vector.insert %1385, %1384 [0, 2, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1387 = vector.extract %216[2, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1388 = vector.insert %1387, %1386 [0, 2, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1389 = vector.extract %216[2, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1390 = vector.insert %1389, %1388 [0, 2, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1391 = vector.extract %216[2, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1392 = vector.insert %1391, %1390 [0, 2, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1393 = vector.extract %216[2, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1394 = vector.insert %1393, %1392 [0, 2, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1395 = vector.extract %216[2, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1396 = vector.insert %1395, %1394 [0, 2, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1397 = vector.extract %216[2, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1398 = vector.insert %1397, %1396 [0, 2, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1399 = vector.extract %216[2, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1400 = vector.insert %1399, %1398 [0, 2, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1401 = vector.extract %216[2, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1402 = vector.insert %1401, %1400 [0, 2, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1403 = vector.extract %216[2, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1404 = vector.insert %1403, %1402 [0, 2, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1405 = vector.extract %216[2, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1406 = vector.insert %1405, %1404 [0, 2, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1407 = vector.extract %216[2, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1408 = vector.insert %1407, %1406 [0, 2, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1409 = vector.extract %216[2, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1410 = vector.insert %1409, %1408 [0, 2, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1411 = vector.extract %216[2, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1412 = vector.insert %1411, %1410 [0, 2, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1413 = vector.extract %216[2, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1414 = vector.insert %1413, %1412 [0, 2, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1415 = vector.extract %216[2, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1416 = vector.insert %1415, %1414 [0, 2, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1417 = vector.extract %216[2, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1418 = vector.insert %1417, %1416 [0, 2, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1419 = vector.extract %216[2, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1420 = vector.insert %1419, %1418 [0, 2, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1421 = vector.extract %216[2, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1422 = vector.insert %1421, %1420 [0, 2, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1423 = vector.extract %216[2, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1424 = vector.insert %1423, %1422 [0, 2, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1425 = vector.extract %216[2, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1426 = vector.insert %1425, %1424 [0, 2, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1427 = vector.extract %216[2, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1428 = vector.insert %1427, %1426 [0, 2, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1429 = vector.extract %216[2, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1430 = vector.insert %1429, %1428 [0, 2, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1431 = vector.extract %216[2, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1432 = vector.insert %1431, %1430 [0, 2, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1433 = vector.extract %216[2, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1434 = vector.insert %1433, %1432 [0, 2, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1435 = vector.extract %216[2, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1436 = vector.insert %1435, %1434 [0, 2, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1437 = vector.extract %216[2, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1438 = vector.insert %1437, %1436 [0, 2, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1439 = vector.extract %216[2, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1440 = vector.insert %1439, %1438 [0, 2, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1441 = vector.extract %216[2, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1442 = vector.insert %1441, %1440 [0, 2, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1443 = vector.extract %216[2, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1444 = vector.insert %1443, %1442 [0, 2, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1445 = vector.extract %216[2, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1446 = vector.insert %1445, %1444 [0, 2, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1447 = vector.extract %216[2, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1448 = vector.insert %1447, %1446 [0, 2, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1449 = vector.extract %216[2, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1450 = vector.insert %1449, %1448 [0, 2, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1451 = vector.extract %216[2, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1452 = vector.insert %1451, %1450 [0, 2, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1453 = vector.extract %216[2, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1454 = vector.insert %1453, %1452 [0, 2, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1455 = vector.extract %216[2, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1456 = vector.insert %1455, %1454 [0, 2, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1457 = vector.extract %216[2, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1458 = vector.insert %1457, %1456 [0, 2, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1459 = vector.extract %216[2, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1460 = vector.insert %1459, %1458 [0, 2, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1461 = vector.extract %216[2, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1462 = vector.insert %1461, %1460 [0, 2, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1463 = vector.extract %216[2, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1464 = vector.insert %1463, %1462 [0, 2, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1465 = vector.extract %216[2, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1466 = vector.insert %1465, %1464 [0, 2, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1467 = vector.extract %216[2, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1468 = vector.insert %1467, %1466 [0, 2, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1469 = vector.extract %216[2, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1470 = vector.insert %1469, %1468 [0, 2, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1471 = vector.extract %216[2, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1472 = vector.insert %1471, %1470 [0, 2, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1473 = vector.extract %216[2, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1474 = vector.insert %1473, %1472 [0, 2, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1475 = vector.extract %216[2, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1476 = vector.insert %1475, %1474 [0, 2, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1477 = vector.extract %216[2, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1478 = vector.insert %1477, %1476 [0, 2, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1479 = vector.extract %216[2, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1480 = vector.insert %1479, %1478 [0, 2, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1481 = vector.extract %216[2, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1482 = vector.insert %1481, %1480 [0, 2, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1483 = vector.extract %216[2, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1484 = vector.insert %1483, %1482 [0, 2, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1485 = vector.extract %216[2, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1486 = vector.insert %1485, %1484 [0, 2, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1487 = vector.extract %216[2, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1488 = vector.insert %1487, %1486 [0, 2, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1489 = vector.extract %216[2, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1490 = vector.insert %1489, %1488 [0, 2, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1491 = vector.extract %216[2, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1492 = vector.insert %1491, %1490 [0, 2, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1493 = vector.extract %216[2, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1494 = vector.insert %1493, %1492 [0, 2, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1495 = vector.extract %216[2, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1496 = vector.insert %1495, %1494 [0, 2, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1497 = vector.extract %216[2, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1498 = vector.insert %1497, %1496 [0, 2, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1499 = vector.extract %216[2, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1500 = vector.insert %1499, %1498 [0, 2, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1501 = vector.extract %216[2, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1502 = vector.insert %1501, %1500 [0, 2, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1503 = vector.extract %216[2, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1504 = vector.insert %1503, %1502 [0, 2, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1505 = vector.extract %216[2, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1506 = vector.insert %1505, %1504 [0, 2, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1507 = vector.extract %216[2, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1508 = vector.insert %1507, %1506 [0, 2, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1509 = vector.extract %216[2, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1510 = vector.insert %1509, %1508 [0, 2, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1511 = vector.extract %216[2, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1512 = vector.insert %1511, %1510 [0, 2, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1513 = vector.extract %216[2, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1514 = vector.insert %1513, %1512 [0, 2, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1515 = vector.extract %216[2, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1516 = vector.insert %1515, %1514 [0, 2, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1517 = vector.extract %216[2, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1518 = vector.insert %1517, %1516 [0, 2, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1519 = vector.extract %216[2, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1520 = vector.insert %1519, %1518 [0, 2, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1521 = vector.extract %216[2, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1522 = vector.insert %1521, %1520 [0, 2, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1523 = vector.extract %216[2, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1524 = vector.insert %1523, %1522 [0, 2, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1525 = vector.extract %216[2, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1526 = vector.insert %1525, %1524 [0, 2, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1527 = vector.extract %216[2, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1528 = vector.insert %1527, %1526 [0, 2, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1529 = vector.extract %216[2, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1530 = vector.insert %1529, %1528 [0, 2, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1531 = vector.extract %216[2, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1532 = vector.insert %1531, %1530 [0, 2, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1533 = vector.extract %216[2, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1534 = vector.insert %1533, %1532 [0, 2, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1535 = vector.extract %216[2, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1536 = vector.insert %1535, %1534 [0, 2, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1537 = vector.extract %216[2, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1538 = vector.insert %1537, %1536 [0, 2, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1539 = vector.extract %216[2, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1540 = vector.insert %1539, %1538 [0, 2, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1541 = vector.extract %216[2, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1542 = vector.insert %1541, %1540 [0, 2, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1543 = vector.extract %216[2, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1544 = vector.insert %1543, %1542 [0, 2, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1545 = vector.extract %216[2, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1546 = vector.insert %1545, %1544 [0, 2, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1547 = vector.extract %216[2, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1548 = vector.insert %1547, %1546 [0, 2, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1549 = vector.extract %216[2, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1550 = vector.insert %1549, %1548 [0, 2, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1551 = vector.extract %216[2, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1552 = vector.insert %1551, %1550 [0, 2, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1553 = vector.extract %216[2, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1554 = vector.insert %1553, %1552 [0, 2, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1555 = vector.extract %216[2, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1556 = vector.insert %1555, %1554 [0, 2, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1557 = vector.extract %216[2, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1558 = vector.insert %1557, %1556 [0, 2, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1559 = vector.extract %216[2, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1560 = vector.insert %1559, %1558 [0, 2, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1561 = vector.extract %216[2, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1562 = vector.insert %1561, %1560 [0, 2, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1563 = vector.extract %216[2, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1564 = vector.insert %1563, %1562 [0, 2, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1565 = vector.extract %216[2, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1566 = vector.insert %1565, %1564 [0, 2, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1567 = vector.extract %216[2, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1568 = vector.insert %1567, %1566 [0, 2, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1569 = vector.extract %216[2, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1570 = vector.insert %1569, %1568 [0, 2, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1571 = vector.extract %216[2, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1572 = vector.insert %1571, %1570 [0, 2, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1573 = vector.extract %216[2, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1574 = vector.insert %1573, %1572 [0, 2, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1575 = vector.extract %216[2, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1576 = vector.insert %1575, %1574 [0, 2, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1577 = vector.extract %216[2, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1578 = vector.insert %1577, %1576 [0, 2, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1579 = vector.extract %216[2, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1580 = vector.insert %1579, %1578 [0, 2, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1581 = vector.extract %216[2, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1582 = vector.insert %1581, %1580 [0, 2, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1583 = vector.extract %216[2, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1584 = vector.insert %1583, %1582 [0, 2, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1585 = vector.extract %216[2, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1586 = vector.insert %1585, %1584 [0, 2, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1587 = vector.extract %216[2, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1588 = vector.insert %1587, %1586 [0, 2, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1589 = vector.extract %216[2, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1590 = vector.insert %1589, %1588 [0, 2, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1591 = vector.extract %216[2, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1592 = vector.insert %1591, %1590 [0, 2, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1593 = vector.extract %216[2, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1594 = vector.insert %1593, %1592 [0, 2, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1595 = vector.extract %216[2, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1596 = vector.insert %1595, %1594 [0, 2, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1597 = vector.extract %216[2, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1598 = vector.insert %1597, %1596 [0, 2, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1599 = vector.extract %216[2, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1600 = vector.insert %1599, %1598 [0, 2, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1601 = vector.extract %216[2, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1602 = vector.insert %1601, %1600 [0, 2, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1603 = vector.extract %216[2, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1604 = vector.insert %1603, %1602 [0, 2, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1605 = vector.extract %216[2, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1606 = vector.insert %1605, %1604 [0, 2, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1607 = vector.extract %216[2, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1608 = vector.insert %1607, %1606 [0, 2, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1609 = vector.extract %216[2, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1610 = vector.insert %1609, %1608 [0, 2, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1611 = vector.extract %216[2, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1612 = vector.insert %1611, %1610 [0, 2, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1613 = vector.extract %216[2, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1614 = vector.insert %1613, %1612 [0, 2, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1615 = vector.extract %216[2, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1616 = vector.insert %1615, %1614 [0, 2, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1617 = vector.extract %216[2, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1618 = vector.insert %1617, %1616 [0, 2, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1619 = vector.extract %216[2, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1620 = vector.insert %1619, %1618 [0, 2, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1621 = vector.extract %216[2, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1622 = vector.insert %1621, %1620 [0, 2, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1623 = vector.extract %216[2, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1624 = vector.insert %1623, %1622 [0, 2, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1625 = vector.extract %216[2, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1626 = vector.insert %1625, %1624 [0, 2, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1627 = vector.extract %216[2, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1628 = vector.insert %1627, %1626 [0, 2, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1629 = vector.extract %216[2, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1630 = vector.insert %1629, %1628 [0, 2, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1631 = vector.extract %216[2, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1632 = vector.insert %1631, %1630 [0, 2, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1633 = vector.extract %216[2, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1634 = vector.insert %1633, %1632 [0, 2, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1635 = vector.extract %216[2, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1636 = vector.insert %1635, %1634 [0, 2, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1637 = vector.extract %216[2, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1638 = vector.insert %1637, %1636 [0, 2, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1639 = vector.extract %216[2, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1640 = vector.insert %1639, %1638 [0, 2, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1641 = vector.extract %216[2, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1642 = vector.insert %1641, %1640 [0, 2, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1643 = vector.extract %216[2, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1644 = vector.insert %1643, %1642 [0, 2, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1645 = vector.extract %216[2, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1646 = vector.insert %1645, %1644 [0, 2, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1647 = vector.extract %216[2, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1648 = vector.insert %1647, %1646 [0, 2, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1649 = vector.extract %216[2, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1650 = vector.insert %1649, %1648 [0, 2, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1651 = vector.extract %216[2, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1652 = vector.insert %1651, %1650 [0, 2, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1653 = vector.extract %216[2, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1654 = vector.insert %1653, %1652 [0, 2, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1655 = vector.extract %216[2, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1656 = vector.insert %1655, %1654 [0, 2, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1657 = vector.extract %216[2, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1658 = vector.insert %1657, %1656 [0, 2, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1659 = vector.extract %216[2, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1660 = vector.insert %1659, %1658 [0, 2, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1661 = vector.extract %216[2, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1662 = vector.insert %1661, %1660 [0, 2, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1663 = vector.extract %216[2, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1664 = vector.insert %1663, %1662 [0, 2, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1665 = vector.extract %216[2, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1666 = vector.insert %1665, %1664 [0, 2, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1667 = vector.extract %216[2, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1668 = vector.insert %1667, %1666 [0, 2, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1669 = vector.extract %216[2, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1670 = vector.insert %1669, %1668 [0, 2, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1671 = vector.extract %216[2, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1672 = vector.insert %1671, %1670 [0, 2, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1673 = vector.extract %216[2, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1674 = vector.insert %1673, %1672 [0, 2, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1675 = vector.extract %216[2, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1676 = vector.insert %1675, %1674 [0, 2, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1677 = vector.extract %216[2, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1678 = vector.insert %1677, %1676 [0, 2, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1679 = vector.extract %216[2, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1680 = vector.insert %1679, %1678 [0, 2, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1681 = vector.extract %216[2, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1682 = vector.insert %1681, %1680 [0, 2, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1683 = vector.extract %216[2, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1684 = vector.insert %1683, %1682 [0, 2, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1685 = vector.extract %216[2, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1686 = vector.insert %1685, %1684 [0, 2, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1687 = vector.extract %216[2, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1688 = vector.insert %1687, %1686 [0, 2, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1689 = vector.extract %216[2, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1690 = vector.insert %1689, %1688 [0, 2, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1691 = vector.extract %216[2, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1692 = vector.insert %1691, %1690 [0, 2, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1693 = vector.extract %216[2, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1694 = vector.insert %1693, %1692 [0, 2, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1695 = vector.extract %216[2, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1696 = vector.insert %1695, %1694 [0, 2, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1697 = vector.extract %216[2, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1698 = vector.insert %1697, %1696 [0, 2, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1699 = vector.extract %216[2, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1700 = vector.insert %1699, %1698 [0, 2, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1701 = vector.extract %216[2, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1702 = vector.insert %1701, %1700 [0, 2, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1703 = vector.extract %216[2, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1704 = vector.insert %1703, %1702 [0, 2, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1705 = vector.extract %216[2, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1706 = vector.insert %1705, %1704 [0, 2, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1707 = vector.extract %216[2, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1708 = vector.insert %1707, %1706 [0, 2, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1709 = vector.extract %216[2, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1710 = vector.insert %1709, %1708 [0, 2, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1711 = vector.extract %216[2, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1712 = vector.insert %1711, %1710 [0, 2, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1713 = vector.extract %216[2, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1714 = vector.insert %1713, %1712 [0, 2, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1715 = vector.extract %216[2, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1716 = vector.insert %1715, %1714 [0, 2, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1717 = vector.extract %216[2, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1718 = vector.insert %1717, %1716 [0, 2, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1719 = vector.extract %216[2, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1720 = vector.insert %1719, %1718 [0, 2, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1721 = vector.extract %216[2, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1722 = vector.insert %1721, %1720 [0, 2, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1723 = vector.extract %216[2, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1724 = vector.insert %1723, %1722 [0, 2, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1725 = vector.extract %216[2, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1726 = vector.insert %1725, %1724 [0, 2, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1727 = vector.extract %216[2, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1728 = vector.insert %1727, %1726 [0, 2, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1729 = vector.extract %216[2, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1730 = vector.insert %1729, %1728 [0, 2, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1731 = vector.extract %216[2, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1732 = vector.insert %1731, %1730 [0, 2, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1733 = vector.extract %216[2, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1734 = vector.insert %1733, %1732 [0, 2, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1735 = vector.extract %216[2, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1736 = vector.insert %1735, %1734 [0, 2, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1737 = vector.extract %216[2, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1738 = vector.insert %1737, %1736 [0, 2, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1739 = vector.extract %216[2, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1740 = vector.insert %1739, %1738 [0, 2, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1741 = vector.extract %216[2, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1742 = vector.insert %1741, %1740 [0, 2, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1743 = vector.extract %216[2, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1744 = vector.insert %1743, %1742 [0, 2, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1745 = vector.extract %216[2, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1746 = vector.insert %1745, %1744 [0, 2, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1747 = vector.extract %216[2, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1748 = vector.insert %1747, %1746 [0, 2, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1749 = vector.extract %216[2, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1750 = vector.insert %1749, %1748 [0, 2, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1751 = vector.extract %216[2, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1752 = vector.insert %1751, %1750 [0, 2, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1753 = vector.extract %216[3, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1754 = vector.insert %1753, %1752 [0, 3, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1755 = vector.extract %216[3, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1756 = vector.insert %1755, %1754 [0, 3, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1757 = vector.extract %216[3, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1758 = vector.insert %1757, %1756 [0, 3, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1759 = vector.extract %216[3, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1760 = vector.insert %1759, %1758 [0, 3, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1761 = vector.extract %216[3, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1762 = vector.insert %1761, %1760 [0, 3, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1763 = vector.extract %216[3, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1764 = vector.insert %1763, %1762 [0, 3, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1765 = vector.extract %216[3, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1766 = vector.insert %1765, %1764 [0, 3, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1767 = vector.extract %216[3, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1768 = vector.insert %1767, %1766 [0, 3, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1769 = vector.extract %216[3, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1770 = vector.insert %1769, %1768 [0, 3, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1771 = vector.extract %216[3, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1772 = vector.insert %1771, %1770 [0, 3, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1773 = vector.extract %216[3, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1774 = vector.insert %1773, %1772 [0, 3, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1775 = vector.extract %216[3, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1776 = vector.insert %1775, %1774 [0, 3, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1777 = vector.extract %216[3, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1778 = vector.insert %1777, %1776 [0, 3, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1779 = vector.extract %216[3, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1780 = vector.insert %1779, %1778 [0, 3, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1781 = vector.extract %216[3, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1782 = vector.insert %1781, %1780 [0, 3, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1783 = vector.extract %216[3, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1784 = vector.insert %1783, %1782 [0, 3, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1785 = vector.extract %216[3, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1786 = vector.insert %1785, %1784 [0, 3, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1787 = vector.extract %216[3, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1788 = vector.insert %1787, %1786 [0, 3, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1789 = vector.extract %216[3, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1790 = vector.insert %1789, %1788 [0, 3, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1791 = vector.extract %216[3, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1792 = vector.insert %1791, %1790 [0, 3, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1793 = vector.extract %216[3, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1794 = vector.insert %1793, %1792 [0, 3, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1795 = vector.extract %216[3, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1796 = vector.insert %1795, %1794 [0, 3, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1797 = vector.extract %216[3, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1798 = vector.insert %1797, %1796 [0, 3, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1799 = vector.extract %216[3, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1800 = vector.insert %1799, %1798 [0, 3, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1801 = vector.extract %216[3, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1802 = vector.insert %1801, %1800 [0, 3, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1803 = vector.extract %216[3, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1804 = vector.insert %1803, %1802 [0, 3, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1805 = vector.extract %216[3, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1806 = vector.insert %1805, %1804 [0, 3, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1807 = vector.extract %216[3, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1808 = vector.insert %1807, %1806 [0, 3, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1809 = vector.extract %216[3, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1810 = vector.insert %1809, %1808 [0, 3, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1811 = vector.extract %216[3, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1812 = vector.insert %1811, %1810 [0, 3, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1813 = vector.extract %216[3, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1814 = vector.insert %1813, %1812 [0, 3, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1815 = vector.extract %216[3, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1816 = vector.insert %1815, %1814 [0, 3, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1817 = vector.extract %216[3, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1818 = vector.insert %1817, %1816 [0, 3, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1819 = vector.extract %216[3, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1820 = vector.insert %1819, %1818 [0, 3, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1821 = vector.extract %216[3, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1822 = vector.insert %1821, %1820 [0, 3, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1823 = vector.extract %216[3, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1824 = vector.insert %1823, %1822 [0, 3, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1825 = vector.extract %216[3, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1826 = vector.insert %1825, %1824 [0, 3, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1827 = vector.extract %216[3, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1828 = vector.insert %1827, %1826 [0, 3, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1829 = vector.extract %216[3, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1830 = vector.insert %1829, %1828 [0, 3, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1831 = vector.extract %216[3, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1832 = vector.insert %1831, %1830 [0, 3, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1833 = vector.extract %216[3, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1834 = vector.insert %1833, %1832 [0, 3, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1835 = vector.extract %216[3, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1836 = vector.insert %1835, %1834 [0, 3, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1837 = vector.extract %216[3, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1838 = vector.insert %1837, %1836 [0, 3, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1839 = vector.extract %216[3, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1840 = vector.insert %1839, %1838 [0, 3, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1841 = vector.extract %216[3, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1842 = vector.insert %1841, %1840 [0, 3, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1843 = vector.extract %216[3, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1844 = vector.insert %1843, %1842 [0, 3, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1845 = vector.extract %216[3, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1846 = vector.insert %1845, %1844 [0, 3, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1847 = vector.extract %216[3, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1848 = vector.insert %1847, %1846 [0, 3, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 3: for k = 0..15, extract the vector<1xf16> at %216[3, 3, k] and insert it at
// [0, 3, k, 3] of the 1x4x16x16x1 vector, i.e. the two trailing position indices are swapped.
// Each insert uses the previous insert's result as its destination (chain: %1848 in, %1880 out).
// NOTE(review): this looks like an unrolled transpose of the two 16-wide dims; the op it was
// lowered from is not visible in this part of the dump.
%1849 = vector.extract %216[3, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1850 = vector.insert %1849, %1848 [0, 3, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1851 = vector.extract %216[3, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1852 = vector.insert %1851, %1850 [0, 3, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1853 = vector.extract %216[3, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1854 = vector.insert %1853, %1852 [0, 3, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1855 = vector.extract %216[3, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1856 = vector.insert %1855, %1854 [0, 3, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1857 = vector.extract %216[3, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1858 = vector.insert %1857, %1856 [0, 3, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1859 = vector.extract %216[3, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1860 = vector.insert %1859, %1858 [0, 3, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1861 = vector.extract %216[3, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1862 = vector.insert %1861, %1860 [0, 3, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1863 = vector.extract %216[3, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1864 = vector.insert %1863, %1862 [0, 3, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1865 = vector.extract %216[3, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1866 = vector.insert %1865, %1864 [0, 3, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1867 = vector.extract %216[3, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1868 = vector.insert %1867, %1866 [0, 3, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1869 = vector.extract %216[3, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1870 = vector.insert %1869, %1868 [0, 3, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1871 = vector.extract %216[3, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1872 = vector.insert %1871, %1870 [0, 3, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1873 = vector.extract %216[3, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1874 = vector.insert %1873, %1872 [0, 3, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1875 = vector.extract %216[3, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1876 = vector.insert %1875, %1874 [0, 3, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1877 = vector.extract %216[3, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1878 = vector.insert %1877, %1876 [0, 3, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1879 = vector.extract %216[3, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1880 = vector.insert %1879, %1878 [0, 3, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 4: for k = 0..15, extract the vector<1xf16> at %216[3, 4, k] and insert it at
// [0, 3, k, 4] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %1880 in, %1912 out).
%1881 = vector.extract %216[3, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1882 = vector.insert %1881, %1880 [0, 3, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1883 = vector.extract %216[3, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1884 = vector.insert %1883, %1882 [0, 3, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1885 = vector.extract %216[3, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1886 = vector.insert %1885, %1884 [0, 3, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1887 = vector.extract %216[3, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1888 = vector.insert %1887, %1886 [0, 3, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1889 = vector.extract %216[3, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1890 = vector.insert %1889, %1888 [0, 3, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1891 = vector.extract %216[3, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1892 = vector.insert %1891, %1890 [0, 3, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1893 = vector.extract %216[3, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1894 = vector.insert %1893, %1892 [0, 3, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1895 = vector.extract %216[3, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1896 = vector.insert %1895, %1894 [0, 3, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1897 = vector.extract %216[3, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1898 = vector.insert %1897, %1896 [0, 3, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1899 = vector.extract %216[3, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1900 = vector.insert %1899, %1898 [0, 3, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1901 = vector.extract %216[3, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1902 = vector.insert %1901, %1900 [0, 3, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1903 = vector.extract %216[3, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1904 = vector.insert %1903, %1902 [0, 3, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1905 = vector.extract %216[3, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1906 = vector.insert %1905, %1904 [0, 3, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1907 = vector.extract %216[3, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1908 = vector.insert %1907, %1906 [0, 3, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1909 = vector.extract %216[3, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1910 = vector.insert %1909, %1908 [0, 3, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1911 = vector.extract %216[3, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1912 = vector.insert %1911, %1910 [0, 3, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 5: for k = 0..15, extract the vector<1xf16> at %216[3, 5, k] and insert it at
// [0, 3, k, 5] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %1912 in, %1944 out).
%1913 = vector.extract %216[3, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1914 = vector.insert %1913, %1912 [0, 3, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1915 = vector.extract %216[3, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1916 = vector.insert %1915, %1914 [0, 3, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1917 = vector.extract %216[3, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1918 = vector.insert %1917, %1916 [0, 3, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1919 = vector.extract %216[3, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1920 = vector.insert %1919, %1918 [0, 3, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1921 = vector.extract %216[3, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1922 = vector.insert %1921, %1920 [0, 3, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1923 = vector.extract %216[3, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1924 = vector.insert %1923, %1922 [0, 3, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1925 = vector.extract %216[3, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1926 = vector.insert %1925, %1924 [0, 3, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1927 = vector.extract %216[3, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1928 = vector.insert %1927, %1926 [0, 3, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1929 = vector.extract %216[3, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1930 = vector.insert %1929, %1928 [0, 3, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1931 = vector.extract %216[3, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1932 = vector.insert %1931, %1930 [0, 3, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1933 = vector.extract %216[3, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1934 = vector.insert %1933, %1932 [0, 3, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1935 = vector.extract %216[3, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1936 = vector.insert %1935, %1934 [0, 3, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1937 = vector.extract %216[3, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1938 = vector.insert %1937, %1936 [0, 3, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1939 = vector.extract %216[3, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1940 = vector.insert %1939, %1938 [0, 3, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1941 = vector.extract %216[3, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1942 = vector.insert %1941, %1940 [0, 3, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1943 = vector.extract %216[3, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1944 = vector.insert %1943, %1942 [0, 3, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 6: for k = 0..15, extract the vector<1xf16> at %216[3, 6, k] and insert it at
// [0, 3, k, 6] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %1944 in, %1976 out).
%1945 = vector.extract %216[3, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1946 = vector.insert %1945, %1944 [0, 3, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1947 = vector.extract %216[3, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1948 = vector.insert %1947, %1946 [0, 3, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1949 = vector.extract %216[3, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1950 = vector.insert %1949, %1948 [0, 3, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1951 = vector.extract %216[3, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1952 = vector.insert %1951, %1950 [0, 3, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1953 = vector.extract %216[3, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1954 = vector.insert %1953, %1952 [0, 3, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1955 = vector.extract %216[3, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1956 = vector.insert %1955, %1954 [0, 3, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1957 = vector.extract %216[3, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1958 = vector.insert %1957, %1956 [0, 3, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1959 = vector.extract %216[3, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1960 = vector.insert %1959, %1958 [0, 3, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1961 = vector.extract %216[3, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1962 = vector.insert %1961, %1960 [0, 3, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1963 = vector.extract %216[3, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1964 = vector.insert %1963, %1962 [0, 3, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1965 = vector.extract %216[3, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1966 = vector.insert %1965, %1964 [0, 3, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1967 = vector.extract %216[3, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1968 = vector.insert %1967, %1966 [0, 3, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1969 = vector.extract %216[3, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1970 = vector.insert %1969, %1968 [0, 3, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1971 = vector.extract %216[3, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1972 = vector.insert %1971, %1970 [0, 3, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1973 = vector.extract %216[3, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1974 = vector.insert %1973, %1972 [0, 3, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1975 = vector.extract %216[3, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1976 = vector.insert %1975, %1974 [0, 3, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 7: for k = 0..15, extract the vector<1xf16> at %216[3, 7, k] and insert it at
// [0, 3, k, 7] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %1976 in, %2008 out).
%1977 = vector.extract %216[3, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1978 = vector.insert %1977, %1976 [0, 3, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1979 = vector.extract %216[3, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1980 = vector.insert %1979, %1978 [0, 3, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1981 = vector.extract %216[3, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1982 = vector.insert %1981, %1980 [0, 3, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1983 = vector.extract %216[3, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1984 = vector.insert %1983, %1982 [0, 3, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1985 = vector.extract %216[3, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1986 = vector.insert %1985, %1984 [0, 3, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1987 = vector.extract %216[3, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1988 = vector.insert %1987, %1986 [0, 3, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1989 = vector.extract %216[3, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1990 = vector.insert %1989, %1988 [0, 3, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1991 = vector.extract %216[3, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1992 = vector.insert %1991, %1990 [0, 3, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1993 = vector.extract %216[3, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1994 = vector.insert %1993, %1992 [0, 3, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1995 = vector.extract %216[3, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1996 = vector.insert %1995, %1994 [0, 3, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1997 = vector.extract %216[3, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1998 = vector.insert %1997, %1996 [0, 3, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1999 = vector.extract %216[3, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2000 = vector.insert %1999, %1998 [0, 3, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2001 = vector.extract %216[3, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2002 = vector.insert %2001, %2000 [0, 3, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2003 = vector.extract %216[3, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2004 = vector.insert %2003, %2002 [0, 3, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2005 = vector.extract %216[3, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2006 = vector.insert %2005, %2004 [0, 3, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2007 = vector.extract %216[3, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2008 = vector.insert %2007, %2006 [0, 3, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 8: for k = 0..15, extract the vector<1xf16> at %216[3, 8, k] and insert it at
// [0, 3, k, 8] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2008 in, %2040 out).
%2009 = vector.extract %216[3, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2010 = vector.insert %2009, %2008 [0, 3, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2011 = vector.extract %216[3, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2012 = vector.insert %2011, %2010 [0, 3, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2013 = vector.extract %216[3, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2014 = vector.insert %2013, %2012 [0, 3, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2015 = vector.extract %216[3, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2016 = vector.insert %2015, %2014 [0, 3, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2017 = vector.extract %216[3, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2018 = vector.insert %2017, %2016 [0, 3, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2019 = vector.extract %216[3, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2020 = vector.insert %2019, %2018 [0, 3, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2021 = vector.extract %216[3, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2022 = vector.insert %2021, %2020 [0, 3, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2023 = vector.extract %216[3, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2024 = vector.insert %2023, %2022 [0, 3, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2025 = vector.extract %216[3, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2026 = vector.insert %2025, %2024 [0, 3, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2027 = vector.extract %216[3, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2028 = vector.insert %2027, %2026 [0, 3, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2029 = vector.extract %216[3, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2030 = vector.insert %2029, %2028 [0, 3, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2031 = vector.extract %216[3, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2032 = vector.insert %2031, %2030 [0, 3, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2033 = vector.extract %216[3, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2034 = vector.insert %2033, %2032 [0, 3, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2035 = vector.extract %216[3, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2036 = vector.insert %2035, %2034 [0, 3, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2037 = vector.extract %216[3, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2038 = vector.insert %2037, %2036 [0, 3, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2039 = vector.extract %216[3, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2040 = vector.insert %2039, %2038 [0, 3, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 9: for k = 0..15, extract the vector<1xf16> at %216[3, 9, k] and insert it at
// [0, 3, k, 9] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2040 in, %2072 out).
%2041 = vector.extract %216[3, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2042 = vector.insert %2041, %2040 [0, 3, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2043 = vector.extract %216[3, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2044 = vector.insert %2043, %2042 [0, 3, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2045 = vector.extract %216[3, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2046 = vector.insert %2045, %2044 [0, 3, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2047 = vector.extract %216[3, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2048 = vector.insert %2047, %2046 [0, 3, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2049 = vector.extract %216[3, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2050 = vector.insert %2049, %2048 [0, 3, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2051 = vector.extract %216[3, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2052 = vector.insert %2051, %2050 [0, 3, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2053 = vector.extract %216[3, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2054 = vector.insert %2053, %2052 [0, 3, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2055 = vector.extract %216[3, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2056 = vector.insert %2055, %2054 [0, 3, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2057 = vector.extract %216[3, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2058 = vector.insert %2057, %2056 [0, 3, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2059 = vector.extract %216[3, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2060 = vector.insert %2059, %2058 [0, 3, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2061 = vector.extract %216[3, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2062 = vector.insert %2061, %2060 [0, 3, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2063 = vector.extract %216[3, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2064 = vector.insert %2063, %2062 [0, 3, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2065 = vector.extract %216[3, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2066 = vector.insert %2065, %2064 [0, 3, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2067 = vector.extract %216[3, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2068 = vector.insert %2067, %2066 [0, 3, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2069 = vector.extract %216[3, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2070 = vector.insert %2069, %2068 [0, 3, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2071 = vector.extract %216[3, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2072 = vector.insert %2071, %2070 [0, 3, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 10: for k = 0..15, extract the vector<1xf16> at %216[3, 10, k] and insert it at
// [0, 3, k, 10] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2072 in, %2104 out).
%2073 = vector.extract %216[3, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2074 = vector.insert %2073, %2072 [0, 3, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2075 = vector.extract %216[3, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2076 = vector.insert %2075, %2074 [0, 3, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2077 = vector.extract %216[3, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2078 = vector.insert %2077, %2076 [0, 3, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2079 = vector.extract %216[3, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2080 = vector.insert %2079, %2078 [0, 3, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2081 = vector.extract %216[3, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2082 = vector.insert %2081, %2080 [0, 3, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2083 = vector.extract %216[3, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2084 = vector.insert %2083, %2082 [0, 3, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2085 = vector.extract %216[3, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2086 = vector.insert %2085, %2084 [0, 3, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2087 = vector.extract %216[3, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2088 = vector.insert %2087, %2086 [0, 3, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2089 = vector.extract %216[3, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2090 = vector.insert %2089, %2088 [0, 3, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2091 = vector.extract %216[3, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2092 = vector.insert %2091, %2090 [0, 3, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2093 = vector.extract %216[3, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2094 = vector.insert %2093, %2092 [0, 3, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2095 = vector.extract %216[3, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2096 = vector.insert %2095, %2094 [0, 3, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2097 = vector.extract %216[3, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2098 = vector.insert %2097, %2096 [0, 3, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2099 = vector.extract %216[3, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2100 = vector.insert %2099, %2098 [0, 3, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2101 = vector.extract %216[3, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2102 = vector.insert %2101, %2100 [0, 3, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2103 = vector.extract %216[3, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2104 = vector.insert %2103, %2102 [0, 3, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 11: for k = 0..15, extract the vector<1xf16> at %216[3, 11, k] and insert it at
// [0, 3, k, 11] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2104 in, %2136 out).
%2105 = vector.extract %216[3, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2106 = vector.insert %2105, %2104 [0, 3, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2107 = vector.extract %216[3, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2108 = vector.insert %2107, %2106 [0, 3, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2109 = vector.extract %216[3, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2110 = vector.insert %2109, %2108 [0, 3, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2111 = vector.extract %216[3, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2112 = vector.insert %2111, %2110 [0, 3, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2113 = vector.extract %216[3, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2114 = vector.insert %2113, %2112 [0, 3, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2115 = vector.extract %216[3, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2116 = vector.insert %2115, %2114 [0, 3, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2117 = vector.extract %216[3, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2118 = vector.insert %2117, %2116 [0, 3, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2119 = vector.extract %216[3, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2120 = vector.insert %2119, %2118 [0, 3, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2121 = vector.extract %216[3, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2122 = vector.insert %2121, %2120 [0, 3, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2123 = vector.extract %216[3, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2124 = vector.insert %2123, %2122 [0, 3, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2125 = vector.extract %216[3, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2126 = vector.insert %2125, %2124 [0, 3, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2127 = vector.extract %216[3, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2128 = vector.insert %2127, %2126 [0, 3, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2129 = vector.extract %216[3, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2130 = vector.insert %2129, %2128 [0, 3, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2131 = vector.extract %216[3, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2132 = vector.insert %2131, %2130 [0, 3, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2133 = vector.extract %216[3, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2134 = vector.insert %2133, %2132 [0, 3, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2135 = vector.extract %216[3, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2136 = vector.insert %2135, %2134 [0, 3, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 12: for k = 0..15, extract the vector<1xf16> at %216[3, 12, k] and insert it at
// [0, 3, k, 12] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2136 in, %2168 out).
%2137 = vector.extract %216[3, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2138 = vector.insert %2137, %2136 [0, 3, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2139 = vector.extract %216[3, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2140 = vector.insert %2139, %2138 [0, 3, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2141 = vector.extract %216[3, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2142 = vector.insert %2141, %2140 [0, 3, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2143 = vector.extract %216[3, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2144 = vector.insert %2143, %2142 [0, 3, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2145 = vector.extract %216[3, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2146 = vector.insert %2145, %2144 [0, 3, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2147 = vector.extract %216[3, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2148 = vector.insert %2147, %2146 [0, 3, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2149 = vector.extract %216[3, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2150 = vector.insert %2149, %2148 [0, 3, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2151 = vector.extract %216[3, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2152 = vector.insert %2151, %2150 [0, 3, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2153 = vector.extract %216[3, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2154 = vector.insert %2153, %2152 [0, 3, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2155 = vector.extract %216[3, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2156 = vector.insert %2155, %2154 [0, 3, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2157 = vector.extract %216[3, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2158 = vector.insert %2157, %2156 [0, 3, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2159 = vector.extract %216[3, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2160 = vector.insert %2159, %2158 [0, 3, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2161 = vector.extract %216[3, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2162 = vector.insert %2161, %2160 [0, 3, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2163 = vector.extract %216[3, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2164 = vector.insert %2163, %2162 [0, 3, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2165 = vector.extract %216[3, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2166 = vector.insert %2165, %2164 [0, 3, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2167 = vector.extract %216[3, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2168 = vector.insert %2167, %2166 [0, 3, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 13: for k = 0..15, extract the vector<1xf16> at %216[3, 13, k] and insert it at
// [0, 3, k, 13] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2168 in, %2200 out).
%2169 = vector.extract %216[3, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2170 = vector.insert %2169, %2168 [0, 3, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2171 = vector.extract %216[3, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2172 = vector.insert %2171, %2170 [0, 3, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2173 = vector.extract %216[3, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2174 = vector.insert %2173, %2172 [0, 3, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2175 = vector.extract %216[3, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2176 = vector.insert %2175, %2174 [0, 3, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2177 = vector.extract %216[3, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2178 = vector.insert %2177, %2176 [0, 3, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2179 = vector.extract %216[3, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2180 = vector.insert %2179, %2178 [0, 3, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2181 = vector.extract %216[3, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2182 = vector.insert %2181, %2180 [0, 3, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2183 = vector.extract %216[3, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2184 = vector.insert %2183, %2182 [0, 3, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2185 = vector.extract %216[3, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2186 = vector.insert %2185, %2184 [0, 3, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2187 = vector.extract %216[3, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2188 = vector.insert %2187, %2186 [0, 3, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2189 = vector.extract %216[3, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2190 = vector.insert %2189, %2188 [0, 3, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2191 = vector.extract %216[3, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2192 = vector.insert %2191, %2190 [0, 3, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2193 = vector.extract %216[3, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2194 = vector.insert %2193, %2192 [0, 3, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2195 = vector.extract %216[3, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2196 = vector.insert %2195, %2194 [0, 3, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2197 = vector.extract %216[3, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2198 = vector.insert %2197, %2196 [0, 3, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2199 = vector.extract %216[3, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2200 = vector.insert %2199, %2198 [0, 3, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
// Group j = 14: for k = 0..15, extract the vector<1xf16> at %216[3, 14, k] and insert it at
// [0, 3, k, 14] of the 1x4x16x16x1 vector (trailing position indices swapped).
// Each insert uses the previous insert's result as its destination (chain: %2200 in, %2232 out).
%2201 = vector.extract %216[3, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2202 = vector.insert %2201, %2200 [0, 3, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2203 = vector.extract %216[3, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2204 = vector.insert %2203, %2202 [0, 3, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2205 = vector.extract %216[3, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2206 = vector.insert %2205, %2204 [0, 3, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2207 = vector.extract %216[3, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2208 = vector.insert %2207, %2206 [0, 3, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2209 = vector.extract %216[3, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2210 = vector.insert %2209, %2208 [0, 3, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2211 = vector.extract %216[3, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2212 = vector.insert %2211, %2210 [0, 3, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2213 = vector.extract %216[3, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2214 = vector.insert %2213, %2212 [0, 3, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2215 = vector.extract %216[3, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2216 = vector.insert %2215, %2214 [0, 3, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2217 = vector.extract %216[3, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2218 = vector.insert %2217, %2216 [0, 3, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2219 = vector.extract %216[3, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2220 = vector.insert %2219, %2218 [0, 3, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2221 = vector.extract %216[3, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2222 = vector.insert %2221, %2220 [0, 3, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2223 = vector.extract %216[3, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2224 = vector.insert %2223, %2222 [0, 3, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2225 = vector.extract %216[3, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2226 = vector.insert %2225, %2224 [0, 3, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2227 = vector.extract %216[3, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2228 = vector.insert %2227, %2226 [0, 3, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2229 = vector.extract %216[3, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2230 = vector.insert %2229, %2228 [0, 3, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2231 = vector.extract %216[3, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2232 = vector.insert %2231, %2230 [0, 3, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2233 = vector.extract %216[3, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2234 = vector.insert %2233, %2232 [0, 3, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2235 = vector.extract %216[3, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2236 = vector.insert %2235, %2234 [0, 3, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2237 = vector.extract %216[3, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2238 = vector.insert %2237, %2236 [0, 3, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2239 = vector.extract %216[3, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2240 = vector.insert %2239, %2238 [0, 3, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2241 = vector.extract %216[3, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2242 = vector.insert %2241, %2240 [0, 3, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2243 = vector.extract %216[3, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2244 = vector.insert %2243, %2242 [0, 3, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2245 = vector.extract %216[3, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2246 = vector.insert %2245, %2244 [0, 3, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2247 = vector.extract %216[3, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2248 = vector.insert %2247, %2246 [0, 3, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2249 = vector.extract %216[3, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2250 = vector.insert %2249, %2248 [0, 3, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2251 = vector.extract %216[3, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2252 = vector.insert %2251, %2250 [0, 3, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2253 = vector.extract %216[3, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2254 = vector.insert %2253, %2252 [0, 3, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2255 = vector.extract %216[3, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2256 = vector.insert %2255, %2254 [0, 3, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2257 = vector.extract %216[3, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2258 = vector.insert %2257, %2256 [0, 3, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2259 = vector.extract %216[3, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2260 = vector.insert %2259, %2258 [0, 3, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2261 = vector.extract %216[3, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2262 = vector.insert %2261, %2260 [0, 3, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2263 = vector.extract %216[3, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2264 = vector.insert %2263, %2262 [0, 3, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2265 = vector.extract %2264[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
%subview_5 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2266 = vector.shape_cast %2265 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
%2267 = vector.extract %2266[0, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2267, %subview_5[%arg3, %c0, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2268 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2269 = vector.extract %2266[0, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2269, %subview_5[%arg3, %c0, %2268, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2270 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2271 = vector.extract %2266[0, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2271, %subview_5[%arg3, %c0, %2270, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2272 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2273 = vector.extract %2266[0, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2273, %subview_5[%arg3, %c0, %2272, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2274 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2275 = vector.extract %2266[0, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2275, %subview_5[%arg3, %c0, %2274, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2276 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2277 = vector.extract %2266[0, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2277, %subview_5[%arg3, %c0, %2276, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2278 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2279 = vector.extract %2266[0, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2279, %subview_5[%arg3, %c0, %2278, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2280 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2281 = vector.extract %2266[0, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2281, %subview_5[%arg3, %c0, %2280, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2282 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2283 = vector.extract %2266[0, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2283, %subview_5[%arg3, %c0, %2282, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2284 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2285 = vector.extract %2266[0, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2285, %subview_5[%arg3, %c0, %2284, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2286 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2287 = vector.extract %2266[0, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2287, %subview_5[%arg3, %c0, %2286, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2288 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2289 = vector.extract %2266[0, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2289, %subview_5[%arg3, %c0, %2288, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2290 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2291 = vector.extract %2266[0, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2291, %subview_5[%arg3, %c0, %2290, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2292 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2293 = vector.extract %2266[0, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2293, %subview_5[%arg3, %c0, %2292, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2294 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2295 = vector.extract %2266[0, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2295, %subview_5[%arg3, %c0, %2294, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2296 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2297 = vector.extract %2266[0, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2297, %subview_5[%arg3, %c0, %2296, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2298 = vector.extract %2266[1, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2298, %subview_5[%arg3, %c1, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2299 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2300 = vector.extract %2266[1, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2300, %subview_5[%arg3, %c1, %2299, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2301 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2302 = vector.extract %2266[1, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2302, %subview_5[%arg3, %c1, %2301, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2303 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2304 = vector.extract %2266[1, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2304, %subview_5[%arg3, %c1, %2303, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2305 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2306 = vector.extract %2266[1, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2306, %subview_5[%arg3, %c1, %2305, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2307 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2308 = vector.extract %2266[1, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2308, %subview_5[%arg3, %c1, %2307, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2309 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2310 = vector.extract %2266[1, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2310, %subview_5[%arg3, %c1, %2309, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2311 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2312 = vector.extract %2266[1, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2312, %subview_5[%arg3, %c1, %2311, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2313 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2314 = vector.extract %2266[1, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2314, %subview_5[%arg3, %c1, %2313, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2315 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2316 = vector.extract %2266[1, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2316, %subview_5[%arg3, %c1, %2315, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2317 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2318 = vector.extract %2266[1, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2318, %subview_5[%arg3, %c1, %2317, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2319 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2320 = vector.extract %2266[1, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2320, %subview_5[%arg3, %c1, %2319, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2321 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2322 = vector.extract %2266[1, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2322, %subview_5[%arg3, %c1, %2321, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2323 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2324 = vector.extract %2266[1, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2324, %subview_5[%arg3, %c1, %2323, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2325 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2326 = vector.extract %2266[1, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2326, %subview_5[%arg3, %c1, %2325, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2327 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2328 = vector.extract %2266[1, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2328, %subview_5[%arg3, %c1, %2327, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2329 = vector.extract %2266[2, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2329, %subview_5[%arg3, %c2, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2330 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2331 = vector.extract %2266[2, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2331, %subview_5[%arg3, %c2, %2330, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2332 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2333 = vector.extract %2266[2, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2333, %subview_5[%arg3, %c2, %2332, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2334 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2335 = vector.extract %2266[2, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2335, %subview_5[%arg3, %c2, %2334, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2336 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2337 = vector.extract %2266[2, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2337, %subview_5[%arg3, %c2, %2336, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2338 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2339 = vector.extract %2266[2, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2339, %subview_5[%arg3, %c2, %2338, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2340 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2341 = vector.extract %2266[2, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2341, %subview_5[%arg3, %c2, %2340, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2342 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2343 = vector.extract %2266[2, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2343, %subview_5[%arg3, %c2, %2342, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2344 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2345 = vector.extract %2266[2, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2345, %subview_5[%arg3, %c2, %2344, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2346 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2347 = vector.extract %2266[2, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2347, %subview_5[%arg3, %c2, %2346, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2348 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2349 = vector.extract %2266[2, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2349, %subview_5[%arg3, %c2, %2348, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2350 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2351 = vector.extract %2266[2, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2351, %subview_5[%arg3, %c2, %2350, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2352 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2353 = vector.extract %2266[2, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2353, %subview_5[%arg3, %c2, %2352, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2354 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2355 = vector.extract %2266[2, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2355, %subview_5[%arg3, %c2, %2354, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2356 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2357 = vector.extract %2266[2, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2357, %subview_5[%arg3, %c2, %2356, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2358 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2359 = vector.extract %2266[2, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2359, %subview_5[%arg3, %c2, %2358, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2360 = vector.extract %2266[3, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2360, %subview_5[%arg3, %c3, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2361 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2362 = vector.extract %2266[3, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2362, %subview_5[%arg3, %c3, %2361, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2363 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2364 = vector.extract %2266[3, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2364, %subview_5[%arg3, %c3, %2363, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2365 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2366 = vector.extract %2266[3, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2366, %subview_5[%arg3, %c3, %2365, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2367 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2368 = vector.extract %2266[3, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2368, %subview_5[%arg3, %c3, %2367, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2369 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2370 = vector.extract %2266[3, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2370, %subview_5[%arg3, %c3, %2369, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2371 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2372 = vector.extract %2266[3, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2372, %subview_5[%arg3, %c3, %2371, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2373 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2374 = vector.extract %2266[3, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2374, %subview_5[%arg3, %c3, %2373, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2375 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2376 = vector.extract %2266[3, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2376, %subview_5[%arg3, %c3, %2375, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2377 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2378 = vector.extract %2266[3, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2378, %subview_5[%arg3, %c3, %2377, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2379 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2380 = vector.extract %2266[3, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2380, %subview_5[%arg3, %c3, %2379, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2381 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2382 = vector.extract %2266[3, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2382, %subview_5[%arg3, %c3, %2381, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2383 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2384 = vector.extract %2266[3, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2384, %subview_5[%arg3, %c3, %2383, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2385 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2386 = vector.extract %2266[3, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2386, %subview_5[%arg3, %c3, %2385, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2387 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2388 = vector.extract %2266[3, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2388, %subview_5[%arg3, %c3, %2387, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2389 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2390 = vector.extract %2266[3, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2390, %subview_5[%arg3, %c3, %2389, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
}
}
}
}
}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<1x4x16x16x1xf16>
%cst_0 = arith.constant dense<0.000000e+00> : vector<4x16x16xf16>
%c63 = arith.constant 63 : index
%c62 = arith.constant 62 : index
%c61 = arith.constant 61 : index
%c60 = arith.constant 60 : index
%c59 = arith.constant 59 : index
%c58 = arith.constant 58 : index
%c57 = arith.constant 57 : index
%c56 = arith.constant 56 : index
%c55 = arith.constant 55 : index
%c54 = arith.constant 54 : index
%c53 = arith.constant 53 : index
%c52 = arith.constant 52 : index
%c51 = arith.constant 51 : index
%c50 = arith.constant 50 : index
%c49 = arith.constant 49 : index
%c48 = arith.constant 48 : index
%c47 = arith.constant 47 : index
%c46 = arith.constant 46 : index
%c45 = arith.constant 45 : index
%c44 = arith.constant 44 : index
%c43 = arith.constant 43 : index
%c42 = arith.constant 42 : index
%c41 = arith.constant 41 : index
%c40 = arith.constant 40 : index
%c39 = arith.constant 39 : index
%c38 = arith.constant 38 : index
%c37 = arith.constant 37 : index
%c36 = arith.constant 36 : index
%c35 = arith.constant 35 : index
%c34 = arith.constant 34 : index
%c33 = arith.constant 33 : index
%c32 = arith.constant 32 : index
%c31 = arith.constant 31 : index
%c30 = arith.constant 30 : index
%c29 = arith.constant 29 : index
%c28 = arith.constant 28 : index
%c27 = arith.constant 27 : index
%c26 = arith.constant 26 : index
%c25 = arith.constant 25 : index
%c24 = arith.constant 24 : index
%c23 = arith.constant 23 : index
%c22 = arith.constant 22 : index
%c21 = arith.constant 21 : index
%c20 = arith.constant 20 : index
%c19 = arith.constant 19 : index
%c18 = arith.constant 18 : index
%c17 = arith.constant 17 : index
%c15 = arith.constant 15 : index
%c14 = arith.constant 14 : index
%c13 = arith.constant 13 : index
%c12 = arith.constant 12 : index
%c11 = arith.constant 11 : index
%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index
%c8 = arith.constant 8 : index
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %22 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c16 {
%24 = vector.load %subview_1[%c0, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%25 = vector.load %subview_1[%c1, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%26 = vector.load %subview_1[%c2, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%27 = vector.load %subview_1[%c3, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%28 = vector.load %subview_1[%c4, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%29 = vector.load %subview_1[%c5, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%30 = vector.load %subview_1[%c6, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%31 = vector.load %subview_1[%c7, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%32 = vector.load %subview_1[%c8, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%33 = vector.load %subview_1[%c9, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%34 = vector.load %subview_1[%c10, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%35 = vector.load %subview_1[%c11, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%36 = vector.load %subview_1[%c12, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%37 = vector.load %subview_1[%c13, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%38 = vector.load %subview_1[%c14, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%39 = vector.load %subview_1[%c15, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%40 = vector.load %subview_1[%c16, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%41 = vector.load %subview_1[%c17, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%42 = vector.load %subview_1[%c18, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%43 = vector.load %subview_1[%c19, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%44 = vector.load %subview_1[%c20, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%45 = vector.load %subview_1[%c21, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%46 = vector.load %subview_1[%c22, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%47 = vector.load %subview_1[%c23, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%48 = vector.load %subview_1[%c24, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%49 = vector.load %subview_1[%c25, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%50 = vector.load %subview_1[%c26, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%51 = vector.load %subview_1[%c27, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%52 = vector.load %subview_1[%c28, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%53 = vector.load %subview_1[%c29, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%54 = vector.load %subview_1[%c30, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%55 = vector.load %subview_1[%c31, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%56 = vector.load %subview_1[%c32, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%57 = vector.load %subview_1[%c33, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%58 = vector.load %subview_1[%c34, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%59 = vector.load %subview_1[%c35, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%60 = vector.load %subview_1[%c36, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%61 = vector.load %subview_1[%c37, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%62 = vector.load %subview_1[%c38, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%63 = vector.load %subview_1[%c39, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%64 = vector.load %subview_1[%c40, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%65 = vector.load %subview_1[%c41, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%66 = vector.load %subview_1[%c42, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%67 = vector.load %subview_1[%c43, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%68 = vector.load %subview_1[%c44, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%69 = vector.load %subview_1[%c45, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%70 = vector.load %subview_1[%c46, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%71 = vector.load %subview_1[%c47, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%72 = vector.load %subview_1[%c48, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%73 = vector.load %subview_1[%c49, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%74 = vector.load %subview_1[%c50, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%75 = vector.load %subview_1[%c51, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%76 = vector.load %subview_1[%c52, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%77 = vector.load %subview_1[%c53, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%78 = vector.load %subview_1[%c54, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%79 = vector.load %subview_1[%c55, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%80 = vector.load %subview_1[%c56, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%81 = vector.load %subview_1[%c57, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%82 = vector.load %subview_1[%c58, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%83 = vector.load %subview_1[%c59, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%84 = vector.load %subview_1[%c60, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%85 = vector.load %subview_1[%c61, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%86 = vector.load %subview_1[%c62, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%87 = vector.load %subview_1[%c63, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%subview_2 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.store %24, %subview_2[%c0, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %25, %subview_2[%c1, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %26, %subview_2[%c2, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %27, %subview_2[%c3, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %28, %subview_2[%c4, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %29, %subview_2[%c5, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %30, %subview_2[%c6, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %31, %subview_2[%c7, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %32, %subview_2[%c8, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %33, %subview_2[%c9, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %34, %subview_2[%c10, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %35, %subview_2[%c11, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %36, %subview_2[%c12, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %37, %subview_2[%c13, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %38, %subview_2[%c14, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %39, %subview_2[%c15, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %40, %subview_2[%c16, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %41, %subview_2[%c17, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %42, %subview_2[%c18, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %43, %subview_2[%c19, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %44, %subview_2[%c20, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %45, %subview_2[%c21, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %46, %subview_2[%c22, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %47, %subview_2[%c23, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %48, %subview_2[%c24, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %49, %subview_2[%c25, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %50, %subview_2[%c26, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %51, %subview_2[%c27, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %52, %subview_2[%c28, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %53, %subview_2[%c29, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %54, %subview_2[%c30, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %55, %subview_2[%c31, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %56, %subview_2[%c32, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %57, %subview_2[%c33, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %58, %subview_2[%c34, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %59, %subview_2[%c35, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %60, %subview_2[%c36, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %61, %subview_2[%c37, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %62, %subview_2[%c38, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %63, %subview_2[%c39, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %64, %subview_2[%c40, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %65, %subview_2[%c41, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %66, %subview_2[%c42, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %67, %subview_2[%c43, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %68, %subview_2[%c44, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %69, %subview_2[%c45, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %70, %subview_2[%c46, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %71, %subview_2[%c47, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %72, %subview_2[%c48, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %73, %subview_2[%c49, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %74, %subview_2[%c50, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %75, %subview_2[%c51, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %76, %subview_2[%c52, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %77, %subview_2[%c53, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %78, %subview_2[%c54, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %79, %subview_2[%c55, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %80, %subview_2[%c56, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %81, %subview_2[%c57, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %82, %subview_2[%c58, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %83, %subview_2[%c59, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %84, %subview_2[%c60, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %85, %subview_2[%c61, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %86, %subview_2[%c62, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %87, %subview_2[%c63, %c0] : memref<64x16xf16>, vector<16xf16>
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_3 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_4 = memref.subview %subview_3[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%88 = vector.load %subview_4[%c0, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%89 = vector.insert %88, %cst_0 [0, 0] : vector<16xf16> into vector<4x16x16xf16>
%90 = vector.load %subview_4[%c0, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%91 = vector.insert %90, %89 [0, 1] : vector<16xf16> into vector<4x16x16xf16>
%92 = vector.load %subview_4[%c0, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%93 = vector.insert %92, %91 [0, 2] : vector<16xf16> into vector<4x16x16xf16>
%94 = vector.load %subview_4[%c0, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%95 = vector.insert %94, %93 [0, 3] : vector<16xf16> into vector<4x16x16xf16>
%96 = vector.load %subview_4[%c0, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%97 = vector.insert %96, %95 [0, 4] : vector<16xf16> into vector<4x16x16xf16>
%98 = vector.load %subview_4[%c0, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%99 = vector.insert %98, %97 [0, 5] : vector<16xf16> into vector<4x16x16xf16>
%100 = vector.load %subview_4[%c0, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%101 = vector.insert %100, %99 [0, 6] : vector<16xf16> into vector<4x16x16xf16>
%102 = vector.load %subview_4[%c0, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%103 = vector.insert %102, %101 [0, 7] : vector<16xf16> into vector<4x16x16xf16>
%104 = vector.load %subview_4[%c0, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%105 = vector.insert %104, %103 [0, 8] : vector<16xf16> into vector<4x16x16xf16>
%106 = vector.load %subview_4[%c0, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%107 = vector.insert %106, %105 [0, 9] : vector<16xf16> into vector<4x16x16xf16>
%108 = vector.load %subview_4[%c0, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%109 = vector.insert %108, %107 [0, 10] : vector<16xf16> into vector<4x16x16xf16>
%110 = vector.load %subview_4[%c0, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%111 = vector.insert %110, %109 [0, 11] : vector<16xf16> into vector<4x16x16xf16>
%112 = vector.load %subview_4[%c0, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%113 = vector.insert %112, %111 [0, 12] : vector<16xf16> into vector<4x16x16xf16>
%114 = vector.load %subview_4[%c0, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%115 = vector.insert %114, %113 [0, 13] : vector<16xf16> into vector<4x16x16xf16>
%116 = vector.load %subview_4[%c0, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%117 = vector.insert %116, %115 [0, 14] : vector<16xf16> into vector<4x16x16xf16>
%118 = vector.load %subview_4[%c0, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%119 = vector.insert %118, %117 [0, 15] : vector<16xf16> into vector<4x16x16xf16>
%120 = vector.load %subview_4[%c1, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%121 = vector.insert %120, %119 [1, 0] : vector<16xf16> into vector<4x16x16xf16>
%122 = vector.load %subview_4[%c1, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%123 = vector.insert %122, %121 [1, 1] : vector<16xf16> into vector<4x16x16xf16>
%124 = vector.load %subview_4[%c1, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%125 = vector.insert %124, %123 [1, 2] : vector<16xf16> into vector<4x16x16xf16>
%126 = vector.load %subview_4[%c1, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%127 = vector.insert %126, %125 [1, 3] : vector<16xf16> into vector<4x16x16xf16>
%128 = vector.load %subview_4[%c1, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%129 = vector.insert %128, %127 [1, 4] : vector<16xf16> into vector<4x16x16xf16>
%130 = vector.load %subview_4[%c1, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%131 = vector.insert %130, %129 [1, 5] : vector<16xf16> into vector<4x16x16xf16>
%132 = vector.load %subview_4[%c1, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%133 = vector.insert %132, %131 [1, 6] : vector<16xf16> into vector<4x16x16xf16>
%134 = vector.load %subview_4[%c1, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%135 = vector.insert %134, %133 [1, 7] : vector<16xf16> into vector<4x16x16xf16>
%136 = vector.load %subview_4[%c1, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%137 = vector.insert %136, %135 [1, 8] : vector<16xf16> into vector<4x16x16xf16>
%138 = vector.load %subview_4[%c1, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%139 = vector.insert %138, %137 [1, 9] : vector<16xf16> into vector<4x16x16xf16>
%140 = vector.load %subview_4[%c1, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%141 = vector.insert %140, %139 [1, 10] : vector<16xf16> into vector<4x16x16xf16>
%142 = vector.load %subview_4[%c1, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%143 = vector.insert %142, %141 [1, 11] : vector<16xf16> into vector<4x16x16xf16>
%144 = vector.load %subview_4[%c1, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%145 = vector.insert %144, %143 [1, 12] : vector<16xf16> into vector<4x16x16xf16>
%146 = vector.load %subview_4[%c1, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%147 = vector.insert %146, %145 [1, 13] : vector<16xf16> into vector<4x16x16xf16>
%148 = vector.load %subview_4[%c1, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%149 = vector.insert %148, %147 [1, 14] : vector<16xf16> into vector<4x16x16xf16>
%150 = vector.load %subview_4[%c1, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%151 = vector.insert %150, %149 [1, 15] : vector<16xf16> into vector<4x16x16xf16>
%152 = vector.load %subview_4[%c2, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%153 = vector.insert %152, %151 [2, 0] : vector<16xf16> into vector<4x16x16xf16>
%154 = vector.load %subview_4[%c2, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%155 = vector.insert %154, %153 [2, 1] : vector<16xf16> into vector<4x16x16xf16>
%156 = vector.load %subview_4[%c2, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%157 = vector.insert %156, %155 [2, 2] : vector<16xf16> into vector<4x16x16xf16>
%158 = vector.load %subview_4[%c2, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%159 = vector.insert %158, %157 [2, 3] : vector<16xf16> into vector<4x16x16xf16>
%160 = vector.load %subview_4[%c2, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%161 = vector.insert %160, %159 [2, 4] : vector<16xf16> into vector<4x16x16xf16>
%162 = vector.load %subview_4[%c2, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%163 = vector.insert %162, %161 [2, 5] : vector<16xf16> into vector<4x16x16xf16>
%164 = vector.load %subview_4[%c2, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%165 = vector.insert %164, %163 [2, 6] : vector<16xf16> into vector<4x16x16xf16>
%166 = vector.load %subview_4[%c2, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%167 = vector.insert %166, %165 [2, 7] : vector<16xf16> into vector<4x16x16xf16>
%168 = vector.load %subview_4[%c2, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%169 = vector.insert %168, %167 [2, 8] : vector<16xf16> into vector<4x16x16xf16>
%170 = vector.load %subview_4[%c2, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%171 = vector.insert %170, %169 [2, 9] : vector<16xf16> into vector<4x16x16xf16>
%172 = vector.load %subview_4[%c2, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%173 = vector.insert %172, %171 [2, 10] : vector<16xf16> into vector<4x16x16xf16>
%174 = vector.load %subview_4[%c2, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%175 = vector.insert %174, %173 [2, 11] : vector<16xf16> into vector<4x16x16xf16>
%176 = vector.load %subview_4[%c2, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%177 = vector.insert %176, %175 [2, 12] : vector<16xf16> into vector<4x16x16xf16>
%178 = vector.load %subview_4[%c2, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%179 = vector.insert %178, %177 [2, 13] : vector<16xf16> into vector<4x16x16xf16>
%180 = vector.load %subview_4[%c2, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%181 = vector.insert %180, %179 [2, 14] : vector<16xf16> into vector<4x16x16xf16>
%182 = vector.load %subview_4[%c2, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%183 = vector.insert %182, %181 [2, 15] : vector<16xf16> into vector<4x16x16xf16>
%184 = vector.load %subview_4[%c3, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%185 = vector.insert %184, %183 [3, 0] : vector<16xf16> into vector<4x16x16xf16>
%186 = vector.load %subview_4[%c3, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%187 = vector.insert %186, %185 [3, 1] : vector<16xf16> into vector<4x16x16xf16>
%188 = vector.load %subview_4[%c3, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%189 = vector.insert %188, %187 [3, 2] : vector<16xf16> into vector<4x16x16xf16>
%190 = vector.load %subview_4[%c3, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%191 = vector.insert %190, %189 [3, 3] : vector<16xf16> into vector<4x16x16xf16>
%192 = vector.load %subview_4[%c3, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%193 = vector.insert %192, %191 [3, 4] : vector<16xf16> into vector<4x16x16xf16>
%194 = vector.load %subview_4[%c3, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%195 = vector.insert %194, %193 [3, 5] : vector<16xf16> into vector<4x16x16xf16>
%196 = vector.load %subview_4[%c3, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%197 = vector.insert %196, %195 [3, 6] : vector<16xf16> into vector<4x16x16xf16>
%198 = vector.load %subview_4[%c3, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%199 = vector.insert %198, %197 [3, 7] : vector<16xf16> into vector<4x16x16xf16>
%200 = vector.load %subview_4[%c3, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%201 = vector.insert %200, %199 [3, 8] : vector<16xf16> into vector<4x16x16xf16>
%202 = vector.load %subview_4[%c3, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%203 = vector.insert %202, %201 [3, 9] : vector<16xf16> into vector<4x16x16xf16>
%204 = vector.load %subview_4[%c3, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%205 = vector.insert %204, %203 [3, 10] : vector<16xf16> into vector<4x16x16xf16>
%206 = vector.load %subview_4[%c3, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%207 = vector.insert %206, %205 [3, 11] : vector<16xf16> into vector<4x16x16xf16>
%208 = vector.load %subview_4[%c3, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%209 = vector.insert %208, %207 [3, 12] : vector<16xf16> into vector<4x16x16xf16>
%210 = vector.load %subview_4[%c3, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%211 = vector.insert %210, %209 [3, 13] : vector<16xf16> into vector<4x16x16xf16>
%212 = vector.load %subview_4[%c3, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%213 = vector.insert %212, %211 [3, 14] : vector<16xf16> into vector<4x16x16xf16>
%214 = vector.load %subview_4[%c3, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%215 = vector.insert %214, %213 [3, 15] : vector<16xf16> into vector<4x16x16xf16>
%216 = vector.shape_cast %215 : vector<4x16x16xf16> to vector<4x16x16x1xf16>
%217 = vector.extract %216[0, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%218 = vector.insert %217, %cst [0, 0, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%219 = vector.extract %216[0, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%220 = vector.insert %219, %218 [0, 0, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%221 = vector.extract %216[0, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%222 = vector.insert %221, %220 [0, 0, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%223 = vector.extract %216[0, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%224 = vector.insert %223, %222 [0, 0, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%225 = vector.extract %216[0, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%226 = vector.insert %225, %224 [0, 0, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%227 = vector.extract %216[0, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%228 = vector.insert %227, %226 [0, 0, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%229 = vector.extract %216[0, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%230 = vector.insert %229, %228 [0, 0, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%231 = vector.extract %216[0, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%232 = vector.insert %231, %230 [0, 0, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%233 = vector.extract %216[0, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%234 = vector.insert %233, %232 [0, 0, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%235 = vector.extract %216[0, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%236 = vector.insert %235, %234 [0, 0, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%237 = vector.extract %216[0, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%238 = vector.insert %237, %236 [0, 0, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%239 = vector.extract %216[0, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%240 = vector.insert %239, %238 [0, 0, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%241 = vector.extract %216[0, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%242 = vector.insert %241, %240 [0, 0, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%243 = vector.extract %216[0, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%244 = vector.insert %243, %242 [0, 0, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%245 = vector.extract %216[0, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%246 = vector.insert %245, %244 [0, 0, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%247 = vector.extract %216[0, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%248 = vector.insert %247, %246 [0, 0, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%249 = vector.extract %216[0, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%250 = vector.insert %249, %248 [0, 0, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%251 = vector.extract %216[0, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%252 = vector.insert %251, %250 [0, 0, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%253 = vector.extract %216[0, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%254 = vector.insert %253, %252 [0, 0, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%255 = vector.extract %216[0, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%256 = vector.insert %255, %254 [0, 0, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%257 = vector.extract %216[0, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%258 = vector.insert %257, %256 [0, 0, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%259 = vector.extract %216[0, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%260 = vector.insert %259, %258 [0, 0, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%261 = vector.extract %216[0, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%262 = vector.insert %261, %260 [0, 0, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%263 = vector.extract %216[0, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%264 = vector.insert %263, %262 [0, 0, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%265 = vector.extract %216[0, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%266 = vector.insert %265, %264 [0, 0, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%267 = vector.extract %216[0, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%268 = vector.insert %267, %266 [0, 0, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%269 = vector.extract %216[0, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%270 = vector.insert %269, %268 [0, 0, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%271 = vector.extract %216[0, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%272 = vector.insert %271, %270 [0, 0, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%273 = vector.extract %216[0, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%274 = vector.insert %273, %272 [0, 0, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%275 = vector.extract %216[0, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%276 = vector.insert %275, %274 [0, 0, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%277 = vector.extract %216[0, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%278 = vector.insert %277, %276 [0, 0, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%279 = vector.extract %216[0, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%280 = vector.insert %279, %278 [0, 0, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%281 = vector.extract %216[0, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%282 = vector.insert %281, %280 [0, 0, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%283 = vector.extract %216[0, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%284 = vector.insert %283, %282 [0, 0, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%285 = vector.extract %216[0, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%286 = vector.insert %285, %284 [0, 0, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%287 = vector.extract %216[0, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%288 = vector.insert %287, %286 [0, 0, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%289 = vector.extract %216[0, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%290 = vector.insert %289, %288 [0, 0, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%291 = vector.extract %216[0, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%292 = vector.insert %291, %290 [0, 0, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%293 = vector.extract %216[0, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%294 = vector.insert %293, %292 [0, 0, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%295 = vector.extract %216[0, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%296 = vector.insert %295, %294 [0, 0, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%297 = vector.extract %216[0, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%298 = vector.insert %297, %296 [0, 0, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%299 = vector.extract %216[0, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%300 = vector.insert %299, %298 [0, 0, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%301 = vector.extract %216[0, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%302 = vector.insert %301, %300 [0, 0, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%303 = vector.extract %216[0, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%304 = vector.insert %303, %302 [0, 0, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%305 = vector.extract %216[0, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%306 = vector.insert %305, %304 [0, 0, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%307 = vector.extract %216[0, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%308 = vector.insert %307, %306 [0, 0, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%309 = vector.extract %216[0, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%310 = vector.insert %309, %308 [0, 0, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%311 = vector.extract %216[0, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%312 = vector.insert %311, %310 [0, 0, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%313 = vector.extract %216[0, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%314 = vector.insert %313, %312 [0, 0, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%315 = vector.extract %216[0, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%316 = vector.insert %315, %314 [0, 0, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%317 = vector.extract %216[0, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%318 = vector.insert %317, %316 [0, 0, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%319 = vector.extract %216[0, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%320 = vector.insert %319, %318 [0, 0, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%321 = vector.extract %216[0, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%322 = vector.insert %321, %320 [0, 0, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%323 = vector.extract %216[0, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%324 = vector.insert %323, %322 [0, 0, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%325 = vector.extract %216[0, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%326 = vector.insert %325, %324 [0, 0, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%327 = vector.extract %216[0, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%328 = vector.insert %327, %326 [0, 0, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%329 = vector.extract %216[0, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%330 = vector.insert %329, %328 [0, 0, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%331 = vector.extract %216[0, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%332 = vector.insert %331, %330 [0, 0, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%333 = vector.extract %216[0, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%334 = vector.insert %333, %332 [0, 0, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%335 = vector.extract %216[0, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%336 = vector.insert %335, %334 [0, 0, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%337 = vector.extract %216[0, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%338 = vector.insert %337, %336 [0, 0, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%339 = vector.extract %216[0, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%340 = vector.insert %339, %338 [0, 0, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%341 = vector.extract %216[0, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%342 = vector.insert %341, %340 [0, 0, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%343 = vector.extract %216[0, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%344 = vector.insert %343, %342 [0, 0, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%345 = vector.extract %216[0, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%346 = vector.insert %345, %344 [0, 0, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%347 = vector.extract %216[0, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%348 = vector.insert %347, %346 [0, 0, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%349 = vector.extract %216[0, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%350 = vector.insert %349, %348 [0, 0, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%351 = vector.extract %216[0, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%352 = vector.insert %351, %350 [0, 0, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%353 = vector.extract %216[0, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%354 = vector.insert %353, %352 [0, 0, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%355 = vector.extract %216[0, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%356 = vector.insert %355, %354 [0, 0, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%357 = vector.extract %216[0, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%358 = vector.insert %357, %356 [0, 0, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%359 = vector.extract %216[0, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%360 = vector.insert %359, %358 [0, 0, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%361 = vector.extract %216[0, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%362 = vector.insert %361, %360 [0, 0, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%363 = vector.extract %216[0, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%364 = vector.insert %363, %362 [0, 0, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%365 = vector.extract %216[0, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%366 = vector.insert %365, %364 [0, 0, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%367 = vector.extract %216[0, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%368 = vector.insert %367, %366 [0, 0, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%369 = vector.extract %216[0, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%370 = vector.insert %369, %368 [0, 0, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%371 = vector.extract %216[0, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%372 = vector.insert %371, %370 [0, 0, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%373 = vector.extract %216[0, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%374 = vector.insert %373, %372 [0, 0, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%375 = vector.extract %216[0, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%376 = vector.insert %375, %374 [0, 0, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%377 = vector.extract %216[0, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%378 = vector.insert %377, %376 [0, 0, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%379 = vector.extract %216[0, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%380 = vector.insert %379, %378 [0, 0, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%381 = vector.extract %216[0, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%382 = vector.insert %381, %380 [0, 0, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%383 = vector.extract %216[0, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%384 = vector.insert %383, %382 [0, 0, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%385 = vector.extract %216[0, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%386 = vector.insert %385, %384 [0, 0, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%387 = vector.extract %216[0, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%388 = vector.insert %387, %386 [0, 0, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%389 = vector.extract %216[0, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%390 = vector.insert %389, %388 [0, 0, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%391 = vector.extract %216[0, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%392 = vector.insert %391, %390 [0, 0, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%393 = vector.extract %216[0, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%394 = vector.insert %393, %392 [0, 0, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%395 = vector.extract %216[0, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%396 = vector.insert %395, %394 [0, 0, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%397 = vector.extract %216[0, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%398 = vector.insert %397, %396 [0, 0, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%399 = vector.extract %216[0, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%400 = vector.insert %399, %398 [0, 0, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%401 = vector.extract %216[0, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%402 = vector.insert %401, %400 [0, 0, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%403 = vector.extract %216[0, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%404 = vector.insert %403, %402 [0, 0, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%405 = vector.extract %216[0, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%406 = vector.insert %405, %404 [0, 0, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%407 = vector.extract %216[0, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%408 = vector.insert %407, %406 [0, 0, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%409 = vector.extract %216[0, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%410 = vector.insert %409, %408 [0, 0, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%411 = vector.extract %216[0, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%412 = vector.insert %411, %410 [0, 0, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%413 = vector.extract %216[0, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%414 = vector.insert %413, %412 [0, 0, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%415 = vector.extract %216[0, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%416 = vector.insert %415, %414 [0, 0, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%417 = vector.extract %216[0, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%418 = vector.insert %417, %416 [0, 0, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%419 = vector.extract %216[0, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%420 = vector.insert %419, %418 [0, 0, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%421 = vector.extract %216[0, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%422 = vector.insert %421, %420 [0, 0, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%423 = vector.extract %216[0, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%424 = vector.insert %423, %422 [0, 0, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%425 = vector.extract %216[0, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%426 = vector.insert %425, %424 [0, 0, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%427 = vector.extract %216[0, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%428 = vector.insert %427, %426 [0, 0, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%429 = vector.extract %216[0, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%430 = vector.insert %429, %428 [0, 0, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%431 = vector.extract %216[0, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%432 = vector.insert %431, %430 [0, 0, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%433 = vector.extract %216[0, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%434 = vector.insert %433, %432 [0, 0, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%435 = vector.extract %216[0, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%436 = vector.insert %435, %434 [0, 0, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%437 = vector.extract %216[0, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%438 = vector.insert %437, %436 [0, 0, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%439 = vector.extract %216[0, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%440 = vector.insert %439, %438 [0, 0, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%441 = vector.extract %216[0, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%442 = vector.insert %441, %440 [0, 0, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%443 = vector.extract %216[0, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%444 = vector.insert %443, %442 [0, 0, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%445 = vector.extract %216[0, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%446 = vector.insert %445, %444 [0, 0, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%447 = vector.extract %216[0, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%448 = vector.insert %447, %446 [0, 0, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%449 = vector.extract %216[0, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%450 = vector.insert %449, %448 [0, 0, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%451 = vector.extract %216[0, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%452 = vector.insert %451, %450 [0, 0, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%453 = vector.extract %216[0, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%454 = vector.insert %453, %452 [0, 0, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%455 = vector.extract %216[0, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%456 = vector.insert %455, %454 [0, 0, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%457 = vector.extract %216[0, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%458 = vector.insert %457, %456 [0, 0, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%459 = vector.extract %216[0, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%460 = vector.insert %459, %458 [0, 0, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%461 = vector.extract %216[0, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%462 = vector.insert %461, %460 [0, 0, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%463 = vector.extract %216[0, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%464 = vector.insert %463, %462 [0, 0, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%465 = vector.extract %216[0, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%466 = vector.insert %465, %464 [0, 0, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%467 = vector.extract %216[0, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%468 = vector.insert %467, %466 [0, 0, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%469 = vector.extract %216[0, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%470 = vector.insert %469, %468 [0, 0, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%471 = vector.extract %216[0, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%472 = vector.insert %471, %470 [0, 0, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%473 = vector.extract %216[0, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%474 = vector.insert %473, %472 [0, 0, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%475 = vector.extract %216[0, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%476 = vector.insert %475, %474 [0, 0, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%477 = vector.extract %216[0, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%478 = vector.insert %477, %476 [0, 0, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%479 = vector.extract %216[0, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%480 = vector.insert %479, %478 [0, 0, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%481 = vector.extract %216[0, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%482 = vector.insert %481, %480 [0, 0, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%483 = vector.extract %216[0, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%484 = vector.insert %483, %482 [0, 0, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%485 = vector.extract %216[0, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%486 = vector.insert %485, %484 [0, 0, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%487 = vector.extract %216[0, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%488 = vector.insert %487, %486 [0, 0, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%489 = vector.extract %216[0, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%490 = vector.insert %489, %488 [0, 0, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%491 = vector.extract %216[0, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%492 = vector.insert %491, %490 [0, 0, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%493 = vector.extract %216[0, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%494 = vector.insert %493, %492 [0, 0, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%495 = vector.extract %216[0, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%496 = vector.insert %495, %494 [0, 0, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%497 = vector.extract %216[0, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%498 = vector.insert %497, %496 [0, 0, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%499 = vector.extract %216[0, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%500 = vector.insert %499, %498 [0, 0, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%501 = vector.extract %216[0, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%502 = vector.insert %501, %500 [0, 0, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%503 = vector.extract %216[0, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%504 = vector.insert %503, %502 [0, 0, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%505 = vector.extract %216[0, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%506 = vector.insert %505, %504 [0, 0, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%507 = vector.extract %216[0, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%508 = vector.insert %507, %506 [0, 0, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%509 = vector.extract %216[0, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%510 = vector.insert %509, %508 [0, 0, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%511 = vector.extract %216[0, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%512 = vector.insert %511, %510 [0, 0, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%513 = vector.extract %216[0, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%514 = vector.insert %513, %512 [0, 0, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%515 = vector.extract %216[0, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%516 = vector.insert %515, %514 [0, 0, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%517 = vector.extract %216[0, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%518 = vector.insert %517, %516 [0, 0, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%519 = vector.extract %216[0, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%520 = vector.insert %519, %518 [0, 0, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%521 = vector.extract %216[0, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%522 = vector.insert %521, %520 [0, 0, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%523 = vector.extract %216[0, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%524 = vector.insert %523, %522 [0, 0, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%525 = vector.extract %216[0, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%526 = vector.insert %525, %524 [0, 0, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%527 = vector.extract %216[0, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%528 = vector.insert %527, %526 [0, 0, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%529 = vector.extract %216[0, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%530 = vector.insert %529, %528 [0, 0, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%531 = vector.extract %216[0, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%532 = vector.insert %531, %530 [0, 0, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%533 = vector.extract %216[0, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%534 = vector.insert %533, %532 [0, 0, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%535 = vector.extract %216[0, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%536 = vector.insert %535, %534 [0, 0, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%537 = vector.extract %216[0, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%538 = vector.insert %537, %536 [0, 0, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%539 = vector.extract %216[0, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%540 = vector.insert %539, %538 [0, 0, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%541 = vector.extract %216[0, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%542 = vector.insert %541, %540 [0, 0, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%543 = vector.extract %216[0, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%544 = vector.insert %543, %542 [0, 0, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%545 = vector.extract %216[0, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%546 = vector.insert %545, %544 [0, 0, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%547 = vector.extract %216[0, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%548 = vector.insert %547, %546 [0, 0, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%549 = vector.extract %216[0, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%550 = vector.insert %549, %548 [0, 0, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%551 = vector.extract %216[0, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%552 = vector.insert %551, %550 [0, 0, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%553 = vector.extract %216[0, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%554 = vector.insert %553, %552 [0, 0, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%555 = vector.extract %216[0, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%556 = vector.insert %555, %554 [0, 0, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%557 = vector.extract %216[0, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%558 = vector.insert %557, %556 [0, 0, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%559 = vector.extract %216[0, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%560 = vector.insert %559, %558 [0, 0, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%561 = vector.extract %216[0, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%562 = vector.insert %561, %560 [0, 0, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%563 = vector.extract %216[0, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%564 = vector.insert %563, %562 [0, 0, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%565 = vector.extract %216[0, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%566 = vector.insert %565, %564 [0, 0, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%567 = vector.extract %216[0, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%568 = vector.insert %567, %566 [0, 0, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%569 = vector.extract %216[0, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%570 = vector.insert %569, %568 [0, 0, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%571 = vector.extract %216[0, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%572 = vector.insert %571, %570 [0, 0, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%573 = vector.extract %216[0, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%574 = vector.insert %573, %572 [0, 0, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%575 = vector.extract %216[0, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%576 = vector.insert %575, %574 [0, 0, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%577 = vector.extract %216[0, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%578 = vector.insert %577, %576 [0, 0, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%579 = vector.extract %216[0, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%580 = vector.insert %579, %578 [0, 0, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%581 = vector.extract %216[0, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%582 = vector.insert %581, %580 [0, 0, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%583 = vector.extract %216[0, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%584 = vector.insert %583, %582 [0, 0, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%585 = vector.extract %216[0, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%586 = vector.insert %585, %584 [0, 0, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%587 = vector.extract %216[0, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%588 = vector.insert %587, %586 [0, 0, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%589 = vector.extract %216[0, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%590 = vector.insert %589, %588 [0, 0, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%591 = vector.extract %216[0, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%592 = vector.insert %591, %590 [0, 0, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%593 = vector.extract %216[0, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%594 = vector.insert %593, %592 [0, 0, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%595 = vector.extract %216[0, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%596 = vector.insert %595, %594 [0, 0, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%597 = vector.extract %216[0, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%598 = vector.insert %597, %596 [0, 0, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%599 = vector.extract %216[0, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%600 = vector.insert %599, %598 [0, 0, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%601 = vector.extract %216[0, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%602 = vector.insert %601, %600 [0, 0, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%603 = vector.extract %216[0, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%604 = vector.insert %603, %602 [0, 0, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%605 = vector.extract %216[0, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%606 = vector.insert %605, %604 [0, 0, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%607 = vector.extract %216[0, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%608 = vector.insert %607, %606 [0, 0, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%609 = vector.extract %216[0, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%610 = vector.insert %609, %608 [0, 0, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%611 = vector.extract %216[0, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%612 = vector.insert %611, %610 [0, 0, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%613 = vector.extract %216[0, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%614 = vector.insert %613, %612 [0, 0, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%615 = vector.extract %216[0, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%616 = vector.insert %615, %614 [0, 0, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%617 = vector.extract %216[0, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%618 = vector.insert %617, %616 [0, 0, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%619 = vector.extract %216[0, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%620 = vector.insert %619, %618 [0, 0, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%621 = vector.extract %216[0, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%622 = vector.insert %621, %620 [0, 0, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%623 = vector.extract %216[0, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%624 = vector.insert %623, %622 [0, 0, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%625 = vector.extract %216[0, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%626 = vector.insert %625, %624 [0, 0, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%627 = vector.extract %216[0, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%628 = vector.insert %627, %626 [0, 0, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%629 = vector.extract %216[0, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%630 = vector.insert %629, %628 [0, 0, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%631 = vector.extract %216[0, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%632 = vector.insert %631, %630 [0, 0, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%633 = vector.extract %216[0, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%634 = vector.insert %633, %632 [0, 0, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%635 = vector.extract %216[0, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%636 = vector.insert %635, %634 [0, 0, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%637 = vector.extract %216[0, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%638 = vector.insert %637, %636 [0, 0, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%639 = vector.extract %216[0, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%640 = vector.insert %639, %638 [0, 0, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%641 = vector.extract %216[0, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%642 = vector.insert %641, %640 [0, 0, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%643 = vector.extract %216[0, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%644 = vector.insert %643, %642 [0, 0, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%645 = vector.extract %216[0, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%646 = vector.insert %645, %644 [0, 0, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%647 = vector.extract %216[0, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%648 = vector.insert %647, %646 [0, 0, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%649 = vector.extract %216[0, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%650 = vector.insert %649, %648 [0, 0, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%651 = vector.extract %216[0, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%652 = vector.insert %651, %650 [0, 0, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%653 = vector.extract %216[0, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%654 = vector.insert %653, %652 [0, 0, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%655 = vector.extract %216[0, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%656 = vector.insert %655, %654 [0, 0, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%657 = vector.extract %216[0, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%658 = vector.insert %657, %656 [0, 0, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%659 = vector.extract %216[0, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%660 = vector.insert %659, %658 [0, 0, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%661 = vector.extract %216[0, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%662 = vector.insert %661, %660 [0, 0, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%663 = vector.extract %216[0, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%664 = vector.insert %663, %662 [0, 0, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%665 = vector.extract %216[0, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%666 = vector.insert %665, %664 [0, 0, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%667 = vector.extract %216[0, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%668 = vector.insert %667, %666 [0, 0, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%669 = vector.extract %216[0, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%670 = vector.insert %669, %668 [0, 0, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%671 = vector.extract %216[0, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%672 = vector.insert %671, %670 [0, 0, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%673 = vector.extract %216[0, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%674 = vector.insert %673, %672 [0, 0, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%675 = vector.extract %216[0, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%676 = vector.insert %675, %674 [0, 0, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%677 = vector.extract %216[0, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%678 = vector.insert %677, %676 [0, 0, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%679 = vector.extract %216[0, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%680 = vector.insert %679, %678 [0, 0, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%681 = vector.extract %216[0, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%682 = vector.insert %681, %680 [0, 0, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%683 = vector.extract %216[0, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%684 = vector.insert %683, %682 [0, 0, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%685 = vector.extract %216[0, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%686 = vector.insert %685, %684 [0, 0, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%687 = vector.extract %216[0, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%688 = vector.insert %687, %686 [0, 0, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%689 = vector.extract %216[0, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%690 = vector.insert %689, %688 [0, 0, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%691 = vector.extract %216[0, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%692 = vector.insert %691, %690 [0, 0, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%693 = vector.extract %216[0, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%694 = vector.insert %693, %692 [0, 0, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%695 = vector.extract %216[0, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%696 = vector.insert %695, %694 [0, 0, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%697 = vector.extract %216[0, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%698 = vector.insert %697, %696 [0, 0, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%699 = vector.extract %216[0, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%700 = vector.insert %699, %698 [0, 0, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%701 = vector.extract %216[0, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%702 = vector.insert %701, %700 [0, 0, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%703 = vector.extract %216[0, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%704 = vector.insert %703, %702 [0, 0, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%705 = vector.extract %216[0, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%706 = vector.insert %705, %704 [0, 0, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%707 = vector.extract %216[0, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%708 = vector.insert %707, %706 [0, 0, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%709 = vector.extract %216[0, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%710 = vector.insert %709, %708 [0, 0, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%711 = vector.extract %216[0, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%712 = vector.insert %711, %710 [0, 0, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%713 = vector.extract %216[0, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%714 = vector.insert %713, %712 [0, 0, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%715 = vector.extract %216[0, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%716 = vector.insert %715, %714 [0, 0, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%717 = vector.extract %216[0, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%718 = vector.insert %717, %716 [0, 0, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%719 = vector.extract %216[0, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%720 = vector.insert %719, %718 [0, 0, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%721 = vector.extract %216[0, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%722 = vector.insert %721, %720 [0, 0, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%723 = vector.extract %216[0, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%724 = vector.insert %723, %722 [0, 0, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%725 = vector.extract %216[0, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%726 = vector.insert %725, %724 [0, 0, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%727 = vector.extract %216[0, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%728 = vector.insert %727, %726 [0, 0, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%729 = vector.extract %216[1, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%730 = vector.insert %729, %728 [0, 1, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%731 = vector.extract %216[1, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%732 = vector.insert %731, %730 [0, 1, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%733 = vector.extract %216[1, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%734 = vector.insert %733, %732 [0, 1, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%735 = vector.extract %216[1, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%736 = vector.insert %735, %734 [0, 1, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%737 = vector.extract %216[1, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%738 = vector.insert %737, %736 [0, 1, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%739 = vector.extract %216[1, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%740 = vector.insert %739, %738 [0, 1, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%741 = vector.extract %216[1, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%742 = vector.insert %741, %740 [0, 1, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%743 = vector.extract %216[1, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%744 = vector.insert %743, %742 [0, 1, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%745 = vector.extract %216[1, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%746 = vector.insert %745, %744 [0, 1, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%747 = vector.extract %216[1, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%748 = vector.insert %747, %746 [0, 1, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%749 = vector.extract %216[1, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%750 = vector.insert %749, %748 [0, 1, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%751 = vector.extract %216[1, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%752 = vector.insert %751, %750 [0, 1, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%753 = vector.extract %216[1, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%754 = vector.insert %753, %752 [0, 1, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%755 = vector.extract %216[1, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%756 = vector.insert %755, %754 [0, 1, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%757 = vector.extract %216[1, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%758 = vector.insert %757, %756 [0, 1, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%759 = vector.extract %216[1, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%760 = vector.insert %759, %758 [0, 1, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%761 = vector.extract %216[1, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%762 = vector.insert %761, %760 [0, 1, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%763 = vector.extract %216[1, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%764 = vector.insert %763, %762 [0, 1, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%765 = vector.extract %216[1, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%766 = vector.insert %765, %764 [0, 1, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%767 = vector.extract %216[1, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%768 = vector.insert %767, %766 [0, 1, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%769 = vector.extract %216[1, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%770 = vector.insert %769, %768 [0, 1, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%771 = vector.extract %216[1, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%772 = vector.insert %771, %770 [0, 1, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%773 = vector.extract %216[1, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%774 = vector.insert %773, %772 [0, 1, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%775 = vector.extract %216[1, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%776 = vector.insert %775, %774 [0, 1, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%777 = vector.extract %216[1, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%778 = vector.insert %777, %776 [0, 1, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%779 = vector.extract %216[1, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%780 = vector.insert %779, %778 [0, 1, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%781 = vector.extract %216[1, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%782 = vector.insert %781, %780 [0, 1, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%783 = vector.extract %216[1, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%784 = vector.insert %783, %782 [0, 1, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%785 = vector.extract %216[1, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%786 = vector.insert %785, %784 [0, 1, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%787 = vector.extract %216[1, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%788 = vector.insert %787, %786 [0, 1, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%789 = vector.extract %216[1, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%790 = vector.insert %789, %788 [0, 1, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%791 = vector.extract %216[1, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%792 = vector.insert %791, %790 [0, 1, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%793 = vector.extract %216[1, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%794 = vector.insert %793, %792 [0, 1, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%795 = vector.extract %216[1, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%796 = vector.insert %795, %794 [0, 1, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%797 = vector.extract %216[1, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%798 = vector.insert %797, %796 [0, 1, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%799 = vector.extract %216[1, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%800 = vector.insert %799, %798 [0, 1, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%801 = vector.extract %216[1, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%802 = vector.insert %801, %800 [0, 1, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%803 = vector.extract %216[1, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%804 = vector.insert %803, %802 [0, 1, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%805 = vector.extract %216[1, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%806 = vector.insert %805, %804 [0, 1, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%807 = vector.extract %216[1, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%808 = vector.insert %807, %806 [0, 1, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%809 = vector.extract %216[1, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%810 = vector.insert %809, %808 [0, 1, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%811 = vector.extract %216[1, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%812 = vector.insert %811, %810 [0, 1, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%813 = vector.extract %216[1, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%814 = vector.insert %813, %812 [0, 1, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%815 = vector.extract %216[1, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%816 = vector.insert %815, %814 [0, 1, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%817 = vector.extract %216[1, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%818 = vector.insert %817, %816 [0, 1, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%819 = vector.extract %216[1, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%820 = vector.insert %819, %818 [0, 1, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%821 = vector.extract %216[1, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%822 = vector.insert %821, %820 [0, 1, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%823 = vector.extract %216[1, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%824 = vector.insert %823, %822 [0, 1, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%825 = vector.extract %216[1, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%826 = vector.insert %825, %824 [0, 1, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%827 = vector.extract %216[1, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%828 = vector.insert %827, %826 [0, 1, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%829 = vector.extract %216[1, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%830 = vector.insert %829, %828 [0, 1, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%831 = vector.extract %216[1, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%832 = vector.insert %831, %830 [0, 1, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%833 = vector.extract %216[1, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%834 = vector.insert %833, %832 [0, 1, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%835 = vector.extract %216[1, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%836 = vector.insert %835, %834 [0, 1, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%837 = vector.extract %216[1, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%838 = vector.insert %837, %836 [0, 1, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%839 = vector.extract %216[1, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%840 = vector.insert %839, %838 [0, 1, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%841 = vector.extract %216[1, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%842 = vector.insert %841, %840 [0, 1, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%843 = vector.extract %216[1, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%844 = vector.insert %843, %842 [0, 1, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%845 = vector.extract %216[1, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%846 = vector.insert %845, %844 [0, 1, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%847 = vector.extract %216[1, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%848 = vector.insert %847, %846 [0, 1, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%849 = vector.extract %216[1, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%850 = vector.insert %849, %848 [0, 1, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%851 = vector.extract %216[1, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%852 = vector.insert %851, %850 [0, 1, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%853 = vector.extract %216[1, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%854 = vector.insert %853, %852 [0, 1, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%855 = vector.extract %216[1, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%856 = vector.insert %855, %854 [0, 1, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%857 = vector.extract %216[1, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%858 = vector.insert %857, %856 [0, 1, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%859 = vector.extract %216[1, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%860 = vector.insert %859, %858 [0, 1, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%861 = vector.extract %216[1, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%862 = vector.insert %861, %860 [0, 1, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%863 = vector.extract %216[1, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%864 = vector.insert %863, %862 [0, 1, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%865 = vector.extract %216[1, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%866 = vector.insert %865, %864 [0, 1, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%867 = vector.extract %216[1, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%868 = vector.insert %867, %866 [0, 1, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%869 = vector.extract %216[1, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%870 = vector.insert %869, %868 [0, 1, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%871 = vector.extract %216[1, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%872 = vector.insert %871, %870 [0, 1, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%873 = vector.extract %216[1, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%874 = vector.insert %873, %872 [0, 1, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%875 = vector.extract %216[1, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%876 = vector.insert %875, %874 [0, 1, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%877 = vector.extract %216[1, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%878 = vector.insert %877, %876 [0, 1, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%879 = vector.extract %216[1, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%880 = vector.insert %879, %878 [0, 1, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%881 = vector.extract %216[1, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%882 = vector.insert %881, %880 [0, 1, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%883 = vector.extract %216[1, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%884 = vector.insert %883, %882 [0, 1, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%885 = vector.extract %216[1, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%886 = vector.insert %885, %884 [0, 1, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%887 = vector.extract %216[1, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%888 = vector.insert %887, %886 [0, 1, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%889 = vector.extract %216[1, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%890 = vector.insert %889, %888 [0, 1, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%891 = vector.extract %216[1, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%892 = vector.insert %891, %890 [0, 1, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%893 = vector.extract %216[1, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%894 = vector.insert %893, %892 [0, 1, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%895 = vector.extract %216[1, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%896 = vector.insert %895, %894 [0, 1, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%897 = vector.extract %216[1, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%898 = vector.insert %897, %896 [0, 1, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%899 = vector.extract %216[1, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%900 = vector.insert %899, %898 [0, 1, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%901 = vector.extract %216[1, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%902 = vector.insert %901, %900 [0, 1, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%903 = vector.extract %216[1, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%904 = vector.insert %903, %902 [0, 1, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%905 = vector.extract %216[1, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%906 = vector.insert %905, %904 [0, 1, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%907 = vector.extract %216[1, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%908 = vector.insert %907, %906 [0, 1, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%909 = vector.extract %216[1, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%910 = vector.insert %909, %908 [0, 1, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%911 = vector.extract %216[1, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%912 = vector.insert %911, %910 [0, 1, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%913 = vector.extract %216[1, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%914 = vector.insert %913, %912 [0, 1, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%915 = vector.extract %216[1, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%916 = vector.insert %915, %914 [0, 1, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%917 = vector.extract %216[1, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%918 = vector.insert %917, %916 [0, 1, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%919 = vector.extract %216[1, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%920 = vector.insert %919, %918 [0, 1, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%921 = vector.extract %216[1, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%922 = vector.insert %921, %920 [0, 1, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%923 = vector.extract %216[1, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%924 = vector.insert %923, %922 [0, 1, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%925 = vector.extract %216[1, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%926 = vector.insert %925, %924 [0, 1, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%927 = vector.extract %216[1, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%928 = vector.insert %927, %926 [0, 1, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%929 = vector.extract %216[1, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%930 = vector.insert %929, %928 [0, 1, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%931 = vector.extract %216[1, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%932 = vector.insert %931, %930 [0, 1, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%933 = vector.extract %216[1, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%934 = vector.insert %933, %932 [0, 1, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%935 = vector.extract %216[1, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%936 = vector.insert %935, %934 [0, 1, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%937 = vector.extract %216[1, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%938 = vector.insert %937, %936 [0, 1, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%939 = vector.extract %216[1, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%940 = vector.insert %939, %938 [0, 1, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%941 = vector.extract %216[1, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%942 = vector.insert %941, %940 [0, 1, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%943 = vector.extract %216[1, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%944 = vector.insert %943, %942 [0, 1, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%945 = vector.extract %216[1, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%946 = vector.insert %945, %944 [0, 1, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%947 = vector.extract %216[1, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%948 = vector.insert %947, %946 [0, 1, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%949 = vector.extract %216[1, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%950 = vector.insert %949, %948 [0, 1, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%951 = vector.extract %216[1, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%952 = vector.insert %951, %950 [0, 1, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%953 = vector.extract %216[1, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%954 = vector.insert %953, %952 [0, 1, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%955 = vector.extract %216[1, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%956 = vector.insert %955, %954 [0, 1, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%957 = vector.extract %216[1, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%958 = vector.insert %957, %956 [0, 1, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%959 = vector.extract %216[1, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%960 = vector.insert %959, %958 [0, 1, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%961 = vector.extract %216[1, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%962 = vector.insert %961, %960 [0, 1, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%963 = vector.extract %216[1, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%964 = vector.insert %963, %962 [0, 1, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%965 = vector.extract %216[1, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%966 = vector.insert %965, %964 [0, 1, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%967 = vector.extract %216[1, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%968 = vector.insert %967, %966 [0, 1, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%969 = vector.extract %216[1, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%970 = vector.insert %969, %968 [0, 1, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%971 = vector.extract %216[1, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%972 = vector.insert %971, %970 [0, 1, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%973 = vector.extract %216[1, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%974 = vector.insert %973, %972 [0, 1, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%975 = vector.extract %216[1, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%976 = vector.insert %975, %974 [0, 1, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%977 = vector.extract %216[1, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%978 = vector.insert %977, %976 [0, 1, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%979 = vector.extract %216[1, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%980 = vector.insert %979, %978 [0, 1, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%981 = vector.extract %216[1, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%982 = vector.insert %981, %980 [0, 1, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%983 = vector.extract %216[1, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%984 = vector.insert %983, %982 [0, 1, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%985 = vector.extract %216[1, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%986 = vector.insert %985, %984 [0, 1, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%987 = vector.extract %216[1, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%988 = vector.insert %987, %986 [0, 1, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%989 = vector.extract %216[1, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%990 = vector.insert %989, %988 [0, 1, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%991 = vector.extract %216[1, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%992 = vector.insert %991, %990 [0, 1, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%993 = vector.extract %216[1, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%994 = vector.insert %993, %992 [0, 1, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%995 = vector.extract %216[1, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%996 = vector.insert %995, %994 [0, 1, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%997 = vector.extract %216[1, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%998 = vector.insert %997, %996 [0, 1, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%999 = vector.extract %216[1, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1000 = vector.insert %999, %998 [0, 1, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1001 = vector.extract %216[1, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1002 = vector.insert %1001, %1000 [0, 1, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1003 = vector.extract %216[1, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1004 = vector.insert %1003, %1002 [0, 1, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1005 = vector.extract %216[1, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1006 = vector.insert %1005, %1004 [0, 1, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1007 = vector.extract %216[1, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1008 = vector.insert %1007, %1006 [0, 1, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1009 = vector.extract %216[1, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1010 = vector.insert %1009, %1008 [0, 1, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1011 = vector.extract %216[1, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1012 = vector.insert %1011, %1010 [0, 1, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1013 = vector.extract %216[1, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1014 = vector.insert %1013, %1012 [0, 1, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1015 = vector.extract %216[1, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1016 = vector.insert %1015, %1014 [0, 1, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1017 = vector.extract %216[1, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1018 = vector.insert %1017, %1016 [0, 1, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1019 = vector.extract %216[1, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1020 = vector.insert %1019, %1018 [0, 1, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1021 = vector.extract %216[1, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1022 = vector.insert %1021, %1020 [0, 1, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1023 = vector.extract %216[1, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1024 = vector.insert %1023, %1022 [0, 1, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1025 = vector.extract %216[1, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1026 = vector.insert %1025, %1024 [0, 1, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1027 = vector.extract %216[1, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1028 = vector.insert %1027, %1026 [0, 1, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1029 = vector.extract %216[1, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1030 = vector.insert %1029, %1028 [0, 1, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1031 = vector.extract %216[1, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1032 = vector.insert %1031, %1030 [0, 1, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1033 = vector.extract %216[1, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1034 = vector.insert %1033, %1032 [0, 1, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1035 = vector.extract %216[1, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1036 = vector.insert %1035, %1034 [0, 1, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1037 = vector.extract %216[1, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1038 = vector.insert %1037, %1036 [0, 1, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1039 = vector.extract %216[1, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1040 = vector.insert %1039, %1038 [0, 1, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1041 = vector.extract %216[1, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1042 = vector.insert %1041, %1040 [0, 1, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1043 = vector.extract %216[1, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1044 = vector.insert %1043, %1042 [0, 1, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1045 = vector.extract %216[1, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1046 = vector.insert %1045, %1044 [0, 1, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1047 = vector.extract %216[1, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1048 = vector.insert %1047, %1046 [0, 1, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1049 = vector.extract %216[1, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1050 = vector.insert %1049, %1048 [0, 1, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1051 = vector.extract %216[1, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1052 = vector.insert %1051, %1050 [0, 1, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1053 = vector.extract %216[1, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1054 = vector.insert %1053, %1052 [0, 1, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1055 = vector.extract %216[1, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1056 = vector.insert %1055, %1054 [0, 1, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1057 = vector.extract %216[1, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1058 = vector.insert %1057, %1056 [0, 1, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1059 = vector.extract %216[1, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1060 = vector.insert %1059, %1058 [0, 1, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1061 = vector.extract %216[1, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1062 = vector.insert %1061, %1060 [0, 1, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1063 = vector.extract %216[1, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1064 = vector.insert %1063, %1062 [0, 1, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1065 = vector.extract %216[1, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1066 = vector.insert %1065, %1064 [0, 1, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1067 = vector.extract %216[1, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1068 = vector.insert %1067, %1066 [0, 1, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1069 = vector.extract %216[1, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1070 = vector.insert %1069, %1068 [0, 1, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1071 = vector.extract %216[1, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1072 = vector.insert %1071, %1070 [0, 1, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1073 = vector.extract %216[1, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1074 = vector.insert %1073, %1072 [0, 1, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1075 = vector.extract %216[1, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1076 = vector.insert %1075, %1074 [0, 1, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1077 = vector.extract %216[1, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1078 = vector.insert %1077, %1076 [0, 1, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1079 = vector.extract %216[1, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1080 = vector.insert %1079, %1078 [0, 1, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1081 = vector.extract %216[1, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1082 = vector.insert %1081, %1080 [0, 1, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1083 = vector.extract %216[1, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1084 = vector.insert %1083, %1082 [0, 1, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1085 = vector.extract %216[1, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1086 = vector.insert %1085, %1084 [0, 1, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1087 = vector.extract %216[1, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1088 = vector.insert %1087, %1086 [0, 1, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1089 = vector.extract %216[1, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1090 = vector.insert %1089, %1088 [0, 1, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1091 = vector.extract %216[1, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1092 = vector.insert %1091, %1090 [0, 1, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1093 = vector.extract %216[1, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1094 = vector.insert %1093, %1092 [0, 1, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1095 = vector.extract %216[1, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1096 = vector.insert %1095, %1094 [0, 1, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1097 = vector.extract %216[1, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1098 = vector.insert %1097, %1096 [0, 1, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1099 = vector.extract %216[1, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1100 = vector.insert %1099, %1098 [0, 1, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1101 = vector.extract %216[1, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1102 = vector.insert %1101, %1100 [0, 1, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1103 = vector.extract %216[1, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1104 = vector.insert %1103, %1102 [0, 1, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1105 = vector.extract %216[1, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1106 = vector.insert %1105, %1104 [0, 1, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1107 = vector.extract %216[1, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1108 = vector.insert %1107, %1106 [0, 1, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1109 = vector.extract %216[1, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1110 = vector.insert %1109, %1108 [0, 1, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1111 = vector.extract %216[1, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1112 = vector.insert %1111, %1110 [0, 1, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1113 = vector.extract %216[1, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1114 = vector.insert %1113, %1112 [0, 1, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1115 = vector.extract %216[1, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1116 = vector.insert %1115, %1114 [0, 1, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1117 = vector.extract %216[1, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1118 = vector.insert %1117, %1116 [0, 1, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1119 = vector.extract %216[1, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1120 = vector.insert %1119, %1118 [0, 1, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1121 = vector.extract %216[1, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1122 = vector.insert %1121, %1120 [0, 1, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1123 = vector.extract %216[1, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1124 = vector.insert %1123, %1122 [0, 1, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1125 = vector.extract %216[1, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1126 = vector.insert %1125, %1124 [0, 1, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1127 = vector.extract %216[1, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1128 = vector.insert %1127, %1126 [0, 1, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1129 = vector.extract %216[1, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1130 = vector.insert %1129, %1128 [0, 1, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1131 = vector.extract %216[1, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1132 = vector.insert %1131, %1130 [0, 1, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1133 = vector.extract %216[1, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1134 = vector.insert %1133, %1132 [0, 1, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1135 = vector.extract %216[1, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1136 = vector.insert %1135, %1134 [0, 1, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1137 = vector.extract %216[1, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1138 = vector.insert %1137, %1136 [0, 1, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1139 = vector.extract %216[1, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1140 = vector.insert %1139, %1138 [0, 1, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1141 = vector.extract %216[1, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1142 = vector.insert %1141, %1140 [0, 1, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1143 = vector.extract %216[1, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1144 = vector.insert %1143, %1142 [0, 1, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1145 = vector.extract %216[1, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1146 = vector.insert %1145, %1144 [0, 1, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1147 = vector.extract %216[1, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1148 = vector.insert %1147, %1146 [0, 1, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1149 = vector.extract %216[1, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1150 = vector.insert %1149, %1148 [0, 1, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1151 = vector.extract %216[1, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1152 = vector.insert %1151, %1150 [0, 1, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1153 = vector.extract %216[1, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1154 = vector.insert %1153, %1152 [0, 1, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1155 = vector.extract %216[1, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1156 = vector.insert %1155, %1154 [0, 1, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1157 = vector.extract %216[1, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1158 = vector.insert %1157, %1156 [0, 1, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1159 = vector.extract %216[1, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1160 = vector.insert %1159, %1158 [0, 1, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1161 = vector.extract %216[1, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1162 = vector.insert %1161, %1160 [0, 1, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1163 = vector.extract %216[1, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1164 = vector.insert %1163, %1162 [0, 1, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1165 = vector.extract %216[1, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1166 = vector.insert %1165, %1164 [0, 1, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1167 = vector.extract %216[1, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1168 = vector.insert %1167, %1166 [0, 1, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1169 = vector.extract %216[1, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1170 = vector.insert %1169, %1168 [0, 1, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1171 = vector.extract %216[1, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1172 = vector.insert %1171, %1170 [0, 1, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1173 = vector.extract %216[1, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1174 = vector.insert %1173, %1172 [0, 1, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1175 = vector.extract %216[1, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1176 = vector.insert %1175, %1174 [0, 1, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1177 = vector.extract %216[1, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1178 = vector.insert %1177, %1176 [0, 1, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1179 = vector.extract %216[1, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1180 = vector.insert %1179, %1178 [0, 1, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1181 = vector.extract %216[1, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1182 = vector.insert %1181, %1180 [0, 1, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1183 = vector.extract %216[1, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1184 = vector.insert %1183, %1182 [0, 1, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1185 = vector.extract %216[1, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1186 = vector.insert %1185, %1184 [0, 1, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1187 = vector.extract %216[1, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1188 = vector.insert %1187, %1186 [0, 1, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1189 = vector.extract %216[1, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1190 = vector.insert %1189, %1188 [0, 1, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1191 = vector.extract %216[1, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1192 = vector.insert %1191, %1190 [0, 1, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1193 = vector.extract %216[1, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1194 = vector.insert %1193, %1192 [0, 1, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1195 = vector.extract %216[1, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1196 = vector.insert %1195, %1194 [0, 1, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1197 = vector.extract %216[1, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1198 = vector.insert %1197, %1196 [0, 1, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1199 = vector.extract %216[1, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1200 = vector.insert %1199, %1198 [0, 1, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1201 = vector.extract %216[1, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1202 = vector.insert %1201, %1200 [0, 1, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1203 = vector.extract %216[1, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1204 = vector.insert %1203, %1202 [0, 1, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1205 = vector.extract %216[1, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1206 = vector.insert %1205, %1204 [0, 1, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1207 = vector.extract %216[1, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1208 = vector.insert %1207, %1206 [0, 1, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1209 = vector.extract %216[1, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1210 = vector.insert %1209, %1208 [0, 1, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1211 = vector.extract %216[1, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1212 = vector.insert %1211, %1210 [0, 1, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1213 = vector.extract %216[1, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1214 = vector.insert %1213, %1212 [0, 1, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1215 = vector.extract %216[1, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1216 = vector.insert %1215, %1214 [0, 1, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1217 = vector.extract %216[1, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1218 = vector.insert %1217, %1216 [0, 1, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1219 = vector.extract %216[1, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1220 = vector.insert %1219, %1218 [0, 1, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1221 = vector.extract %216[1, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1222 = vector.insert %1221, %1220 [0, 1, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1223 = vector.extract %216[1, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1224 = vector.insert %1223, %1222 [0, 1, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1225 = vector.extract %216[1, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1226 = vector.insert %1225, %1224 [0, 1, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1227 = vector.extract %216[1, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1228 = vector.insert %1227, %1226 [0, 1, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1229 = vector.extract %216[1, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1230 = vector.insert %1229, %1228 [0, 1, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1231 = vector.extract %216[1, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1232 = vector.insert %1231, %1230 [0, 1, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1233 = vector.extract %216[1, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1234 = vector.insert %1233, %1232 [0, 1, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1235 = vector.extract %216[1, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1236 = vector.insert %1235, %1234 [0, 1, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1237 = vector.extract %216[1, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1238 = vector.insert %1237, %1236 [0, 1, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1239 = vector.extract %216[1, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1240 = vector.insert %1239, %1238 [0, 1, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1241 = vector.extract %216[2, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1242 = vector.insert %1241, %1240 [0, 2, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1243 = vector.extract %216[2, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1244 = vector.insert %1243, %1242 [0, 2, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1245 = vector.extract %216[2, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1246 = vector.insert %1245, %1244 [0, 2, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1247 = vector.extract %216[2, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1248 = vector.insert %1247, %1246 [0, 2, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1249 = vector.extract %216[2, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1250 = vector.insert %1249, %1248 [0, 2, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1251 = vector.extract %216[2, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1252 = vector.insert %1251, %1250 [0, 2, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1253 = vector.extract %216[2, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1254 = vector.insert %1253, %1252 [0, 2, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1255 = vector.extract %216[2, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1256 = vector.insert %1255, %1254 [0, 2, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1257 = vector.extract %216[2, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1258 = vector.insert %1257, %1256 [0, 2, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1259 = vector.extract %216[2, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1260 = vector.insert %1259, %1258 [0, 2, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1261 = vector.extract %216[2, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1262 = vector.insert %1261, %1260 [0, 2, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1263 = vector.extract %216[2, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1264 = vector.insert %1263, %1262 [0, 2, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1265 = vector.extract %216[2, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1266 = vector.insert %1265, %1264 [0, 2, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1267 = vector.extract %216[2, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1268 = vector.insert %1267, %1266 [0, 2, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1269 = vector.extract %216[2, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1270 = vector.insert %1269, %1268 [0, 2, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1271 = vector.extract %216[2, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1272 = vector.insert %1271, %1270 [0, 2, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1273 = vector.extract %216[2, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1274 = vector.insert %1273, %1272 [0, 2, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1275 = vector.extract %216[2, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1276 = vector.insert %1275, %1274 [0, 2, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1277 = vector.extract %216[2, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1278 = vector.insert %1277, %1276 [0, 2, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1279 = vector.extract %216[2, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1280 = vector.insert %1279, %1278 [0, 2, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1281 = vector.extract %216[2, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1282 = vector.insert %1281, %1280 [0, 2, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1283 = vector.extract %216[2, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1284 = vector.insert %1283, %1282 [0, 2, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1285 = vector.extract %216[2, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1286 = vector.insert %1285, %1284 [0, 2, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1287 = vector.extract %216[2, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1288 = vector.insert %1287, %1286 [0, 2, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1289 = vector.extract %216[2, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1290 = vector.insert %1289, %1288 [0, 2, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1291 = vector.extract %216[2, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1292 = vector.insert %1291, %1290 [0, 2, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1293 = vector.extract %216[2, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1294 = vector.insert %1293, %1292 [0, 2, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1295 = vector.extract %216[2, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1296 = vector.insert %1295, %1294 [0, 2, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1297 = vector.extract %216[2, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1298 = vector.insert %1297, %1296 [0, 2, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1299 = vector.extract %216[2, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1300 = vector.insert %1299, %1298 [0, 2, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1301 = vector.extract %216[2, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1302 = vector.insert %1301, %1300 [0, 2, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1303 = vector.extract %216[2, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1304 = vector.insert %1303, %1302 [0, 2, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1305 = vector.extract %216[2, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1306 = vector.insert %1305, %1304 [0, 2, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1307 = vector.extract %216[2, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1308 = vector.insert %1307, %1306 [0, 2, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1309 = vector.extract %216[2, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1310 = vector.insert %1309, %1308 [0, 2, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1311 = vector.extract %216[2, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1312 = vector.insert %1311, %1310 [0, 2, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1313 = vector.extract %216[2, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1314 = vector.insert %1313, %1312 [0, 2, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1315 = vector.extract %216[2, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1316 = vector.insert %1315, %1314 [0, 2, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1317 = vector.extract %216[2, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1318 = vector.insert %1317, %1316 [0, 2, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1319 = vector.extract %216[2, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1320 = vector.insert %1319, %1318 [0, 2, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1321 = vector.extract %216[2, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1322 = vector.insert %1321, %1320 [0, 2, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1323 = vector.extract %216[2, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1324 = vector.insert %1323, %1322 [0, 2, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1325 = vector.extract %216[2, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1326 = vector.insert %1325, %1324 [0, 2, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1327 = vector.extract %216[2, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1328 = vector.insert %1327, %1326 [0, 2, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1329 = vector.extract %216[2, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1330 = vector.insert %1329, %1328 [0, 2, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1331 = vector.extract %216[2, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1332 = vector.insert %1331, %1330 [0, 2, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1333 = vector.extract %216[2, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1334 = vector.insert %1333, %1332 [0, 2, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1335 = vector.extract %216[2, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1336 = vector.insert %1335, %1334 [0, 2, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1337 = vector.extract %216[2, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1338 = vector.insert %1337, %1336 [0, 2, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1339 = vector.extract %216[2, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1340 = vector.insert %1339, %1338 [0, 2, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1341 = vector.extract %216[2, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1342 = vector.insert %1341, %1340 [0, 2, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1343 = vector.extract %216[2, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1344 = vector.insert %1343, %1342 [0, 2, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1345 = vector.extract %216[2, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1346 = vector.insert %1345, %1344 [0, 2, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1347 = vector.extract %216[2, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1348 = vector.insert %1347, %1346 [0, 2, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1349 = vector.extract %216[2, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1350 = vector.insert %1349, %1348 [0, 2, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1351 = vector.extract %216[2, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1352 = vector.insert %1351, %1350 [0, 2, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1353 = vector.extract %216[2, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1354 = vector.insert %1353, %1352 [0, 2, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1355 = vector.extract %216[2, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1356 = vector.insert %1355, %1354 [0, 2, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1357 = vector.extract %216[2, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1358 = vector.insert %1357, %1356 [0, 2, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1359 = vector.extract %216[2, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1360 = vector.insert %1359, %1358 [0, 2, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1361 = vector.extract %216[2, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1362 = vector.insert %1361, %1360 [0, 2, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1363 = vector.extract %216[2, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1364 = vector.insert %1363, %1362 [0, 2, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1365 = vector.extract %216[2, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1366 = vector.insert %1365, %1364 [0, 2, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1367 = vector.extract %216[2, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1368 = vector.insert %1367, %1366 [0, 2, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1369 = vector.extract %216[2, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1370 = vector.insert %1369, %1368 [0, 2, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1371 = vector.extract %216[2, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1372 = vector.insert %1371, %1370 [0, 2, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1373 = vector.extract %216[2, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1374 = vector.insert %1373, %1372 [0, 2, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1375 = vector.extract %216[2, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1376 = vector.insert %1375, %1374 [0, 2, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1377 = vector.extract %216[2, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1378 = vector.insert %1377, %1376 [0, 2, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1379 = vector.extract %216[2, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1380 = vector.insert %1379, %1378 [0, 2, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1381 = vector.extract %216[2, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1382 = vector.insert %1381, %1380 [0, 2, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1383 = vector.extract %216[2, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1384 = vector.insert %1383, %1382 [0, 2, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1385 = vector.extract %216[2, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1386 = vector.insert %1385, %1384 [0, 2, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1387 = vector.extract %216[2, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1388 = vector.insert %1387, %1386 [0, 2, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1389 = vector.extract %216[2, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1390 = vector.insert %1389, %1388 [0, 2, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1391 = vector.extract %216[2, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1392 = vector.insert %1391, %1390 [0, 2, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1393 = vector.extract %216[2, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1394 = vector.insert %1393, %1392 [0, 2, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1395 = vector.extract %216[2, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1396 = vector.insert %1395, %1394 [0, 2, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1397 = vector.extract %216[2, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1398 = vector.insert %1397, %1396 [0, 2, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1399 = vector.extract %216[2, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1400 = vector.insert %1399, %1398 [0, 2, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1401 = vector.extract %216[2, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1402 = vector.insert %1401, %1400 [0, 2, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1403 = vector.extract %216[2, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1404 = vector.insert %1403, %1402 [0, 2, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1405 = vector.extract %216[2, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1406 = vector.insert %1405, %1404 [0, 2, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1407 = vector.extract %216[2, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1408 = vector.insert %1407, %1406 [0, 2, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1409 = vector.extract %216[2, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1410 = vector.insert %1409, %1408 [0, 2, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1411 = vector.extract %216[2, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1412 = vector.insert %1411, %1410 [0, 2, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1413 = vector.extract %216[2, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1414 = vector.insert %1413, %1412 [0, 2, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1415 = vector.extract %216[2, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1416 = vector.insert %1415, %1414 [0, 2, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1417 = vector.extract %216[2, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1418 = vector.insert %1417, %1416 [0, 2, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1419 = vector.extract %216[2, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1420 = vector.insert %1419, %1418 [0, 2, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1421 = vector.extract %216[2, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1422 = vector.insert %1421, %1420 [0, 2, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1423 = vector.extract %216[2, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1424 = vector.insert %1423, %1422 [0, 2, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1425 = vector.extract %216[2, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1426 = vector.insert %1425, %1424 [0, 2, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1427 = vector.extract %216[2, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1428 = vector.insert %1427, %1426 [0, 2, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1429 = vector.extract %216[2, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1430 = vector.insert %1429, %1428 [0, 2, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1431 = vector.extract %216[2, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1432 = vector.insert %1431, %1430 [0, 2, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1433 = vector.extract %216[2, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1434 = vector.insert %1433, %1432 [0, 2, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1435 = vector.extract %216[2, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1436 = vector.insert %1435, %1434 [0, 2, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1437 = vector.extract %216[2, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1438 = vector.insert %1437, %1436 [0, 2, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1439 = vector.extract %216[2, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1440 = vector.insert %1439, %1438 [0, 2, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1441 = vector.extract %216[2, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1442 = vector.insert %1441, %1440 [0, 2, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1443 = vector.extract %216[2, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1444 = vector.insert %1443, %1442 [0, 2, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1445 = vector.extract %216[2, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1446 = vector.insert %1445, %1444 [0, 2, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1447 = vector.extract %216[2, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1448 = vector.insert %1447, %1446 [0, 2, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1449 = vector.extract %216[2, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1450 = vector.insert %1449, %1448 [0, 2, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1451 = vector.extract %216[2, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1452 = vector.insert %1451, %1450 [0, 2, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1453 = vector.extract %216[2, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1454 = vector.insert %1453, %1452 [0, 2, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1455 = vector.extract %216[2, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1456 = vector.insert %1455, %1454 [0, 2, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1457 = vector.extract %216[2, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1458 = vector.insert %1457, %1456 [0, 2, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1459 = vector.extract %216[2, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1460 = vector.insert %1459, %1458 [0, 2, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1461 = vector.extract %216[2, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1462 = vector.insert %1461, %1460 [0, 2, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1463 = vector.extract %216[2, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1464 = vector.insert %1463, %1462 [0, 2, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1465 = vector.extract %216[2, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1466 = vector.insert %1465, %1464 [0, 2, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1467 = vector.extract %216[2, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1468 = vector.insert %1467, %1466 [0, 2, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1469 = vector.extract %216[2, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1470 = vector.insert %1469, %1468 [0, 2, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1471 = vector.extract %216[2, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1472 = vector.insert %1471, %1470 [0, 2, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1473 = vector.extract %216[2, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1474 = vector.insert %1473, %1472 [0, 2, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1475 = vector.extract %216[2, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1476 = vector.insert %1475, %1474 [0, 2, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1477 = vector.extract %216[2, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1478 = vector.insert %1477, %1476 [0, 2, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1479 = vector.extract %216[2, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1480 = vector.insert %1479, %1478 [0, 2, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1481 = vector.extract %216[2, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1482 = vector.insert %1481, %1480 [0, 2, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1483 = vector.extract %216[2, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1484 = vector.insert %1483, %1482 [0, 2, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1485 = vector.extract %216[2, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1486 = vector.insert %1485, %1484 [0, 2, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1487 = vector.extract %216[2, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1488 = vector.insert %1487, %1486 [0, 2, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1489 = vector.extract %216[2, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1490 = vector.insert %1489, %1488 [0, 2, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1491 = vector.extract %216[2, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1492 = vector.insert %1491, %1490 [0, 2, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1493 = vector.extract %216[2, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1494 = vector.insert %1493, %1492 [0, 2, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1495 = vector.extract %216[2, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1496 = vector.insert %1495, %1494 [0, 2, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1497 = vector.extract %216[2, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1498 = vector.insert %1497, %1496 [0, 2, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1499 = vector.extract %216[2, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1500 = vector.insert %1499, %1498 [0, 2, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1501 = vector.extract %216[2, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1502 = vector.insert %1501, %1500 [0, 2, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1503 = vector.extract %216[2, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1504 = vector.insert %1503, %1502 [0, 2, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1505 = vector.extract %216[2, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1506 = vector.insert %1505, %1504 [0, 2, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1507 = vector.extract %216[2, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1508 = vector.insert %1507, %1506 [0, 2, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1509 = vector.extract %216[2, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1510 = vector.insert %1509, %1508 [0, 2, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1511 = vector.extract %216[2, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1512 = vector.insert %1511, %1510 [0, 2, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1513 = vector.extract %216[2, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1514 = vector.insert %1513, %1512 [0, 2, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1515 = vector.extract %216[2, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1516 = vector.insert %1515, %1514 [0, 2, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1517 = vector.extract %216[2, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1518 = vector.insert %1517, %1516 [0, 2, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1519 = vector.extract %216[2, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1520 = vector.insert %1519, %1518 [0, 2, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1521 = vector.extract %216[2, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1522 = vector.insert %1521, %1520 [0, 2, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1523 = vector.extract %216[2, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1524 = vector.insert %1523, %1522 [0, 2, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1525 = vector.extract %216[2, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1526 = vector.insert %1525, %1524 [0, 2, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1527 = vector.extract %216[2, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1528 = vector.insert %1527, %1526 [0, 2, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1529 = vector.extract %216[2, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1530 = vector.insert %1529, %1528 [0, 2, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1531 = vector.extract %216[2, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1532 = vector.insert %1531, %1530 [0, 2, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1533 = vector.extract %216[2, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1534 = vector.insert %1533, %1532 [0, 2, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1535 = vector.extract %216[2, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1536 = vector.insert %1535, %1534 [0, 2, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1537 = vector.extract %216[2, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1538 = vector.insert %1537, %1536 [0, 2, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1539 = vector.extract %216[2, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1540 = vector.insert %1539, %1538 [0, 2, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1541 = vector.extract %216[2, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1542 = vector.insert %1541, %1540 [0, 2, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1543 = vector.extract %216[2, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1544 = vector.insert %1543, %1542 [0, 2, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1545 = vector.extract %216[2, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1546 = vector.insert %1545, %1544 [0, 2, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1547 = vector.extract %216[2, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1548 = vector.insert %1547, %1546 [0, 2, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1549 = vector.extract %216[2, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1550 = vector.insert %1549, %1548 [0, 2, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1551 = vector.extract %216[2, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1552 = vector.insert %1551, %1550 [0, 2, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1553 = vector.extract %216[2, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1554 = vector.insert %1553, %1552 [0, 2, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1555 = vector.extract %216[2, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1556 = vector.insert %1555, %1554 [0, 2, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1557 = vector.extract %216[2, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1558 = vector.insert %1557, %1556 [0, 2, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1559 = vector.extract %216[2, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1560 = vector.insert %1559, %1558 [0, 2, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1561 = vector.extract %216[2, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1562 = vector.insert %1561, %1560 [0, 2, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1563 = vector.extract %216[2, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1564 = vector.insert %1563, %1562 [0, 2, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1565 = vector.extract %216[2, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1566 = vector.insert %1565, %1564 [0, 2, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1567 = vector.extract %216[2, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1568 = vector.insert %1567, %1566 [0, 2, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1569 = vector.extract %216[2, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1570 = vector.insert %1569, %1568 [0, 2, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1571 = vector.extract %216[2, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1572 = vector.insert %1571, %1570 [0, 2, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1573 = vector.extract %216[2, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1574 = vector.insert %1573, %1572 [0, 2, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1575 = vector.extract %216[2, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1576 = vector.insert %1575, %1574 [0, 2, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1577 = vector.extract %216[2, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1578 = vector.insert %1577, %1576 [0, 2, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1579 = vector.extract %216[2, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1580 = vector.insert %1579, %1578 [0, 2, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1581 = vector.extract %216[2, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1582 = vector.insert %1581, %1580 [0, 2, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1583 = vector.extract %216[2, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1584 = vector.insert %1583, %1582 [0, 2, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1585 = vector.extract %216[2, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1586 = vector.insert %1585, %1584 [0, 2, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1587 = vector.extract %216[2, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1588 = vector.insert %1587, %1586 [0, 2, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1589 = vector.extract %216[2, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1590 = vector.insert %1589, %1588 [0, 2, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1591 = vector.extract %216[2, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1592 = vector.insert %1591, %1590 [0, 2, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1593 = vector.extract %216[2, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1594 = vector.insert %1593, %1592 [0, 2, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1595 = vector.extract %216[2, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1596 = vector.insert %1595, %1594 [0, 2, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1597 = vector.extract %216[2, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1598 = vector.insert %1597, %1596 [0, 2, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1599 = vector.extract %216[2, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1600 = vector.insert %1599, %1598 [0, 2, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1601 = vector.extract %216[2, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1602 = vector.insert %1601, %1600 [0, 2, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1603 = vector.extract %216[2, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1604 = vector.insert %1603, %1602 [0, 2, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1605 = vector.extract %216[2, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1606 = vector.insert %1605, %1604 [0, 2, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1607 = vector.extract %216[2, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1608 = vector.insert %1607, %1606 [0, 2, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1609 = vector.extract %216[2, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1610 = vector.insert %1609, %1608 [0, 2, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1611 = vector.extract %216[2, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1612 = vector.insert %1611, %1610 [0, 2, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1613 = vector.extract %216[2, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1614 = vector.insert %1613, %1612 [0, 2, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1615 = vector.extract %216[2, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1616 = vector.insert %1615, %1614 [0, 2, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1617 = vector.extract %216[2, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1618 = vector.insert %1617, %1616 [0, 2, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1619 = vector.extract %216[2, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1620 = vector.insert %1619, %1618 [0, 2, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1621 = vector.extract %216[2, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1622 = vector.insert %1621, %1620 [0, 2, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1623 = vector.extract %216[2, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1624 = vector.insert %1623, %1622 [0, 2, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1625 = vector.extract %216[2, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1626 = vector.insert %1625, %1624 [0, 2, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1627 = vector.extract %216[2, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1628 = vector.insert %1627, %1626 [0, 2, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1629 = vector.extract %216[2, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1630 = vector.insert %1629, %1628 [0, 2, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1631 = vector.extract %216[2, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1632 = vector.insert %1631, %1630 [0, 2, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1633 = vector.extract %216[2, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1634 = vector.insert %1633, %1632 [0, 2, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1635 = vector.extract %216[2, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1636 = vector.insert %1635, %1634 [0, 2, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1637 = vector.extract %216[2, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1638 = vector.insert %1637, %1636 [0, 2, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1639 = vector.extract %216[2, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1640 = vector.insert %1639, %1638 [0, 2, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1641 = vector.extract %216[2, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1642 = vector.insert %1641, %1640 [0, 2, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1643 = vector.extract %216[2, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1644 = vector.insert %1643, %1642 [0, 2, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1645 = vector.extract %216[2, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1646 = vector.insert %1645, %1644 [0, 2, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1647 = vector.extract %216[2, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1648 = vector.insert %1647, %1646 [0, 2, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1649 = vector.extract %216[2, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1650 = vector.insert %1649, %1648 [0, 2, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1651 = vector.extract %216[2, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1652 = vector.insert %1651, %1650 [0, 2, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1653 = vector.extract %216[2, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1654 = vector.insert %1653, %1652 [0, 2, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1655 = vector.extract %216[2, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1656 = vector.insert %1655, %1654 [0, 2, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1657 = vector.extract %216[2, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1658 = vector.insert %1657, %1656 [0, 2, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1659 = vector.extract %216[2, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1660 = vector.insert %1659, %1658 [0, 2, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1661 = vector.extract %216[2, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1662 = vector.insert %1661, %1660 [0, 2, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1663 = vector.extract %216[2, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1664 = vector.insert %1663, %1662 [0, 2, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1665 = vector.extract %216[2, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1666 = vector.insert %1665, %1664 [0, 2, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1667 = vector.extract %216[2, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1668 = vector.insert %1667, %1666 [0, 2, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1669 = vector.extract %216[2, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1670 = vector.insert %1669, %1668 [0, 2, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1671 = vector.extract %216[2, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1672 = vector.insert %1671, %1670 [0, 2, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1673 = vector.extract %216[2, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1674 = vector.insert %1673, %1672 [0, 2, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1675 = vector.extract %216[2, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1676 = vector.insert %1675, %1674 [0, 2, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1677 = vector.extract %216[2, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1678 = vector.insert %1677, %1676 [0, 2, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1679 = vector.extract %216[2, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1680 = vector.insert %1679, %1678 [0, 2, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1681 = vector.extract %216[2, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1682 = vector.insert %1681, %1680 [0, 2, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1683 = vector.extract %216[2, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1684 = vector.insert %1683, %1682 [0, 2, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1685 = vector.extract %216[2, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1686 = vector.insert %1685, %1684 [0, 2, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1687 = vector.extract %216[2, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1688 = vector.insert %1687, %1686 [0, 2, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1689 = vector.extract %216[2, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1690 = vector.insert %1689, %1688 [0, 2, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1691 = vector.extract %216[2, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1692 = vector.insert %1691, %1690 [0, 2, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1693 = vector.extract %216[2, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1694 = vector.insert %1693, %1692 [0, 2, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1695 = vector.extract %216[2, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1696 = vector.insert %1695, %1694 [0, 2, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1697 = vector.extract %216[2, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1698 = vector.insert %1697, %1696 [0, 2, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1699 = vector.extract %216[2, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1700 = vector.insert %1699, %1698 [0, 2, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1701 = vector.extract %216[2, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1702 = vector.insert %1701, %1700 [0, 2, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1703 = vector.extract %216[2, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1704 = vector.insert %1703, %1702 [0, 2, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1705 = vector.extract %216[2, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1706 = vector.insert %1705, %1704 [0, 2, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1707 = vector.extract %216[2, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1708 = vector.insert %1707, %1706 [0, 2, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1709 = vector.extract %216[2, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1710 = vector.insert %1709, %1708 [0, 2, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1711 = vector.extract %216[2, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1712 = vector.insert %1711, %1710 [0, 2, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1713 = vector.extract %216[2, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1714 = vector.insert %1713, %1712 [0, 2, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1715 = vector.extract %216[2, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1716 = vector.insert %1715, %1714 [0, 2, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1717 = vector.extract %216[2, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1718 = vector.insert %1717, %1716 [0, 2, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1719 = vector.extract %216[2, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1720 = vector.insert %1719, %1718 [0, 2, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1721 = vector.extract %216[2, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1722 = vector.insert %1721, %1720 [0, 2, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1723 = vector.extract %216[2, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1724 = vector.insert %1723, %1722 [0, 2, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1725 = vector.extract %216[2, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1726 = vector.insert %1725, %1724 [0, 2, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1727 = vector.extract %216[2, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1728 = vector.insert %1727, %1726 [0, 2, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1729 = vector.extract %216[2, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1730 = vector.insert %1729, %1728 [0, 2, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1731 = vector.extract %216[2, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1732 = vector.insert %1731, %1730 [0, 2, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1733 = vector.extract %216[2, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1734 = vector.insert %1733, %1732 [0, 2, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1735 = vector.extract %216[2, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1736 = vector.insert %1735, %1734 [0, 2, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1737 = vector.extract %216[2, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1738 = vector.insert %1737, %1736 [0, 2, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1739 = vector.extract %216[2, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1740 = vector.insert %1739, %1738 [0, 2, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1741 = vector.extract %216[2, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1742 = vector.insert %1741, %1740 [0, 2, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1743 = vector.extract %216[2, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1744 = vector.insert %1743, %1742 [0, 2, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1745 = vector.extract %216[2, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1746 = vector.insert %1745, %1744 [0, 2, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1747 = vector.extract %216[2, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1748 = vector.insert %1747, %1746 [0, 2, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1749 = vector.extract %216[2, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1750 = vector.insert %1749, %1748 [0, 2, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1751 = vector.extract %216[2, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1752 = vector.insert %1751, %1750 [0, 2, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1753 = vector.extract %216[3, 0, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1754 = vector.insert %1753, %1752 [0, 3, 0, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1755 = vector.extract %216[3, 0, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1756 = vector.insert %1755, %1754 [0, 3, 1, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1757 = vector.extract %216[3, 0, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1758 = vector.insert %1757, %1756 [0, 3, 2, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1759 = vector.extract %216[3, 0, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1760 = vector.insert %1759, %1758 [0, 3, 3, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1761 = vector.extract %216[3, 0, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1762 = vector.insert %1761, %1760 [0, 3, 4, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1763 = vector.extract %216[3, 0, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1764 = vector.insert %1763, %1762 [0, 3, 5, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1765 = vector.extract %216[3, 0, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1766 = vector.insert %1765, %1764 [0, 3, 6, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1767 = vector.extract %216[3, 0, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1768 = vector.insert %1767, %1766 [0, 3, 7, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1769 = vector.extract %216[3, 0, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1770 = vector.insert %1769, %1768 [0, 3, 8, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1771 = vector.extract %216[3, 0, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1772 = vector.insert %1771, %1770 [0, 3, 9, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1773 = vector.extract %216[3, 0, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1774 = vector.insert %1773, %1772 [0, 3, 10, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1775 = vector.extract %216[3, 0, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1776 = vector.insert %1775, %1774 [0, 3, 11, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1777 = vector.extract %216[3, 0, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1778 = vector.insert %1777, %1776 [0, 3, 12, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1779 = vector.extract %216[3, 0, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1780 = vector.insert %1779, %1778 [0, 3, 13, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1781 = vector.extract %216[3, 0, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1782 = vector.insert %1781, %1780 [0, 3, 14, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1783 = vector.extract %216[3, 0, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1784 = vector.insert %1783, %1782 [0, 3, 15, 0] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1785 = vector.extract %216[3, 1, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1786 = vector.insert %1785, %1784 [0, 3, 0, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1787 = vector.extract %216[3, 1, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1788 = vector.insert %1787, %1786 [0, 3, 1, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1789 = vector.extract %216[3, 1, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1790 = vector.insert %1789, %1788 [0, 3, 2, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1791 = vector.extract %216[3, 1, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1792 = vector.insert %1791, %1790 [0, 3, 3, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1793 = vector.extract %216[3, 1, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1794 = vector.insert %1793, %1792 [0, 3, 4, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1795 = vector.extract %216[3, 1, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1796 = vector.insert %1795, %1794 [0, 3, 5, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1797 = vector.extract %216[3, 1, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1798 = vector.insert %1797, %1796 [0, 3, 6, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1799 = vector.extract %216[3, 1, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1800 = vector.insert %1799, %1798 [0, 3, 7, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1801 = vector.extract %216[3, 1, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1802 = vector.insert %1801, %1800 [0, 3, 8, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1803 = vector.extract %216[3, 1, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1804 = vector.insert %1803, %1802 [0, 3, 9, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1805 = vector.extract %216[3, 1, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1806 = vector.insert %1805, %1804 [0, 3, 10, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1807 = vector.extract %216[3, 1, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1808 = vector.insert %1807, %1806 [0, 3, 11, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1809 = vector.extract %216[3, 1, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1810 = vector.insert %1809, %1808 [0, 3, 12, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1811 = vector.extract %216[3, 1, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1812 = vector.insert %1811, %1810 [0, 3, 13, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1813 = vector.extract %216[3, 1, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1814 = vector.insert %1813, %1812 [0, 3, 14, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1815 = vector.extract %216[3, 1, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1816 = vector.insert %1815, %1814 [0, 3, 15, 1] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1817 = vector.extract %216[3, 2, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1818 = vector.insert %1817, %1816 [0, 3, 0, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1819 = vector.extract %216[3, 2, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1820 = vector.insert %1819, %1818 [0, 3, 1, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1821 = vector.extract %216[3, 2, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1822 = vector.insert %1821, %1820 [0, 3, 2, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1823 = vector.extract %216[3, 2, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1824 = vector.insert %1823, %1822 [0, 3, 3, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1825 = vector.extract %216[3, 2, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1826 = vector.insert %1825, %1824 [0, 3, 4, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1827 = vector.extract %216[3, 2, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1828 = vector.insert %1827, %1826 [0, 3, 5, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1829 = vector.extract %216[3, 2, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1830 = vector.insert %1829, %1828 [0, 3, 6, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1831 = vector.extract %216[3, 2, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1832 = vector.insert %1831, %1830 [0, 3, 7, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1833 = vector.extract %216[3, 2, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1834 = vector.insert %1833, %1832 [0, 3, 8, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1835 = vector.extract %216[3, 2, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1836 = vector.insert %1835, %1834 [0, 3, 9, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1837 = vector.extract %216[3, 2, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1838 = vector.insert %1837, %1836 [0, 3, 10, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1839 = vector.extract %216[3, 2, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1840 = vector.insert %1839, %1838 [0, 3, 11, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1841 = vector.extract %216[3, 2, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1842 = vector.insert %1841, %1840 [0, 3, 12, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1843 = vector.extract %216[3, 2, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1844 = vector.insert %1843, %1842 [0, 3, 13, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1845 = vector.extract %216[3, 2, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1846 = vector.insert %1845, %1844 [0, 3, 14, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1847 = vector.extract %216[3, 2, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1848 = vector.insert %1847, %1846 [0, 3, 15, 2] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1849 = vector.extract %216[3, 3, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1850 = vector.insert %1849, %1848 [0, 3, 0, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1851 = vector.extract %216[3, 3, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1852 = vector.insert %1851, %1850 [0, 3, 1, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1853 = vector.extract %216[3, 3, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1854 = vector.insert %1853, %1852 [0, 3, 2, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1855 = vector.extract %216[3, 3, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1856 = vector.insert %1855, %1854 [0, 3, 3, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1857 = vector.extract %216[3, 3, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1858 = vector.insert %1857, %1856 [0, 3, 4, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1859 = vector.extract %216[3, 3, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1860 = vector.insert %1859, %1858 [0, 3, 5, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1861 = vector.extract %216[3, 3, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1862 = vector.insert %1861, %1860 [0, 3, 6, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1863 = vector.extract %216[3, 3, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1864 = vector.insert %1863, %1862 [0, 3, 7, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1865 = vector.extract %216[3, 3, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1866 = vector.insert %1865, %1864 [0, 3, 8, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1867 = vector.extract %216[3, 3, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1868 = vector.insert %1867, %1866 [0, 3, 9, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1869 = vector.extract %216[3, 3, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1870 = vector.insert %1869, %1868 [0, 3, 10, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1871 = vector.extract %216[3, 3, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1872 = vector.insert %1871, %1870 [0, 3, 11, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1873 = vector.extract %216[3, 3, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1874 = vector.insert %1873, %1872 [0, 3, 12, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1875 = vector.extract %216[3, 3, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1876 = vector.insert %1875, %1874 [0, 3, 13, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1877 = vector.extract %216[3, 3, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1878 = vector.insert %1877, %1876 [0, 3, 14, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1879 = vector.extract %216[3, 3, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1880 = vector.insert %1879, %1878 [0, 3, 15, 3] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1881 = vector.extract %216[3, 4, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1882 = vector.insert %1881, %1880 [0, 3, 0, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1883 = vector.extract %216[3, 4, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1884 = vector.insert %1883, %1882 [0, 3, 1, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1885 = vector.extract %216[3, 4, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1886 = vector.insert %1885, %1884 [0, 3, 2, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1887 = vector.extract %216[3, 4, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1888 = vector.insert %1887, %1886 [0, 3, 3, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1889 = vector.extract %216[3, 4, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1890 = vector.insert %1889, %1888 [0, 3, 4, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1891 = vector.extract %216[3, 4, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1892 = vector.insert %1891, %1890 [0, 3, 5, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1893 = vector.extract %216[3, 4, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1894 = vector.insert %1893, %1892 [0, 3, 6, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1895 = vector.extract %216[3, 4, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1896 = vector.insert %1895, %1894 [0, 3, 7, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1897 = vector.extract %216[3, 4, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1898 = vector.insert %1897, %1896 [0, 3, 8, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1899 = vector.extract %216[3, 4, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1900 = vector.insert %1899, %1898 [0, 3, 9, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1901 = vector.extract %216[3, 4, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1902 = vector.insert %1901, %1900 [0, 3, 10, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1903 = vector.extract %216[3, 4, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1904 = vector.insert %1903, %1902 [0, 3, 11, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1905 = vector.extract %216[3, 4, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1906 = vector.insert %1905, %1904 [0, 3, 12, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1907 = vector.extract %216[3, 4, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1908 = vector.insert %1907, %1906 [0, 3, 13, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1909 = vector.extract %216[3, 4, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1910 = vector.insert %1909, %1908 [0, 3, 14, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1911 = vector.extract %216[3, 4, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1912 = vector.insert %1911, %1910 [0, 3, 15, 4] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1913 = vector.extract %216[3, 5, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1914 = vector.insert %1913, %1912 [0, 3, 0, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1915 = vector.extract %216[3, 5, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1916 = vector.insert %1915, %1914 [0, 3, 1, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1917 = vector.extract %216[3, 5, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1918 = vector.insert %1917, %1916 [0, 3, 2, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1919 = vector.extract %216[3, 5, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1920 = vector.insert %1919, %1918 [0, 3, 3, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1921 = vector.extract %216[3, 5, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1922 = vector.insert %1921, %1920 [0, 3, 4, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1923 = vector.extract %216[3, 5, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1924 = vector.insert %1923, %1922 [0, 3, 5, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1925 = vector.extract %216[3, 5, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1926 = vector.insert %1925, %1924 [0, 3, 6, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1927 = vector.extract %216[3, 5, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1928 = vector.insert %1927, %1926 [0, 3, 7, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1929 = vector.extract %216[3, 5, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1930 = vector.insert %1929, %1928 [0, 3, 8, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1931 = vector.extract %216[3, 5, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1932 = vector.insert %1931, %1930 [0, 3, 9, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1933 = vector.extract %216[3, 5, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1934 = vector.insert %1933, %1932 [0, 3, 10, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1935 = vector.extract %216[3, 5, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1936 = vector.insert %1935, %1934 [0, 3, 11, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1937 = vector.extract %216[3, 5, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1938 = vector.insert %1937, %1936 [0, 3, 12, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1939 = vector.extract %216[3, 5, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1940 = vector.insert %1939, %1938 [0, 3, 13, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1941 = vector.extract %216[3, 5, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1942 = vector.insert %1941, %1940 [0, 3, 14, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1943 = vector.extract %216[3, 5, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1944 = vector.insert %1943, %1942 [0, 3, 15, 5] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1945 = vector.extract %216[3, 6, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1946 = vector.insert %1945, %1944 [0, 3, 0, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1947 = vector.extract %216[3, 6, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1948 = vector.insert %1947, %1946 [0, 3, 1, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1949 = vector.extract %216[3, 6, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1950 = vector.insert %1949, %1948 [0, 3, 2, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1951 = vector.extract %216[3, 6, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1952 = vector.insert %1951, %1950 [0, 3, 3, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1953 = vector.extract %216[3, 6, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1954 = vector.insert %1953, %1952 [0, 3, 4, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1955 = vector.extract %216[3, 6, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1956 = vector.insert %1955, %1954 [0, 3, 5, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1957 = vector.extract %216[3, 6, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1958 = vector.insert %1957, %1956 [0, 3, 6, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1959 = vector.extract %216[3, 6, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1960 = vector.insert %1959, %1958 [0, 3, 7, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1961 = vector.extract %216[3, 6, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1962 = vector.insert %1961, %1960 [0, 3, 8, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1963 = vector.extract %216[3, 6, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1964 = vector.insert %1963, %1962 [0, 3, 9, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1965 = vector.extract %216[3, 6, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1966 = vector.insert %1965, %1964 [0, 3, 10, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1967 = vector.extract %216[3, 6, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%1968 = vector.insert %1967, %1966 [0, 3, 11, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1969 = vector.extract %216[3, 6, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%1970 = vector.insert %1969, %1968 [0, 3, 12, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1971 = vector.extract %216[3, 6, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%1972 = vector.insert %1971, %1970 [0, 3, 13, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1973 = vector.extract %216[3, 6, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%1974 = vector.insert %1973, %1972 [0, 3, 14, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1975 = vector.extract %216[3, 6, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%1976 = vector.insert %1975, %1974 [0, 3, 15, 6] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1977 = vector.extract %216[3, 7, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%1978 = vector.insert %1977, %1976 [0, 3, 0, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1979 = vector.extract %216[3, 7, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%1980 = vector.insert %1979, %1978 [0, 3, 1, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1981 = vector.extract %216[3, 7, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%1982 = vector.insert %1981, %1980 [0, 3, 2, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1983 = vector.extract %216[3, 7, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%1984 = vector.insert %1983, %1982 [0, 3, 3, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1985 = vector.extract %216[3, 7, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%1986 = vector.insert %1985, %1984 [0, 3, 4, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1987 = vector.extract %216[3, 7, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%1988 = vector.insert %1987, %1986 [0, 3, 5, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1989 = vector.extract %216[3, 7, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%1990 = vector.insert %1989, %1988 [0, 3, 6, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1991 = vector.extract %216[3, 7, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%1992 = vector.insert %1991, %1990 [0, 3, 7, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1993 = vector.extract %216[3, 7, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%1994 = vector.insert %1993, %1992 [0, 3, 8, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1995 = vector.extract %216[3, 7, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%1996 = vector.insert %1995, %1994 [0, 3, 9, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1997 = vector.extract %216[3, 7, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%1998 = vector.insert %1997, %1996 [0, 3, 10, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%1999 = vector.extract %216[3, 7, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2000 = vector.insert %1999, %1998 [0, 3, 11, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2001 = vector.extract %216[3, 7, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2002 = vector.insert %2001, %2000 [0, 3, 12, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2003 = vector.extract %216[3, 7, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2004 = vector.insert %2003, %2002 [0, 3, 13, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2005 = vector.extract %216[3, 7, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2006 = vector.insert %2005, %2004 [0, 3, 14, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2007 = vector.extract %216[3, 7, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2008 = vector.insert %2007, %2006 [0, 3, 15, 7] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2009 = vector.extract %216[3, 8, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2010 = vector.insert %2009, %2008 [0, 3, 0, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2011 = vector.extract %216[3, 8, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2012 = vector.insert %2011, %2010 [0, 3, 1, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2013 = vector.extract %216[3, 8, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2014 = vector.insert %2013, %2012 [0, 3, 2, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2015 = vector.extract %216[3, 8, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2016 = vector.insert %2015, %2014 [0, 3, 3, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2017 = vector.extract %216[3, 8, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2018 = vector.insert %2017, %2016 [0, 3, 4, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2019 = vector.extract %216[3, 8, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2020 = vector.insert %2019, %2018 [0, 3, 5, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2021 = vector.extract %216[3, 8, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2022 = vector.insert %2021, %2020 [0, 3, 6, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2023 = vector.extract %216[3, 8, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2024 = vector.insert %2023, %2022 [0, 3, 7, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2025 = vector.extract %216[3, 8, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2026 = vector.insert %2025, %2024 [0, 3, 8, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2027 = vector.extract %216[3, 8, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2028 = vector.insert %2027, %2026 [0, 3, 9, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2029 = vector.extract %216[3, 8, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2030 = vector.insert %2029, %2028 [0, 3, 10, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2031 = vector.extract %216[3, 8, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2032 = vector.insert %2031, %2030 [0, 3, 11, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2033 = vector.extract %216[3, 8, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2034 = vector.insert %2033, %2032 [0, 3, 12, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2035 = vector.extract %216[3, 8, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2036 = vector.insert %2035, %2034 [0, 3, 13, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2037 = vector.extract %216[3, 8, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2038 = vector.insert %2037, %2036 [0, 3, 14, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2039 = vector.extract %216[3, 8, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2040 = vector.insert %2039, %2038 [0, 3, 15, 8] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2041 = vector.extract %216[3, 9, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2042 = vector.insert %2041, %2040 [0, 3, 0, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2043 = vector.extract %216[3, 9, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2044 = vector.insert %2043, %2042 [0, 3, 1, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2045 = vector.extract %216[3, 9, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2046 = vector.insert %2045, %2044 [0, 3, 2, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2047 = vector.extract %216[3, 9, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2048 = vector.insert %2047, %2046 [0, 3, 3, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2049 = vector.extract %216[3, 9, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2050 = vector.insert %2049, %2048 [0, 3, 4, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2051 = vector.extract %216[3, 9, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2052 = vector.insert %2051, %2050 [0, 3, 5, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2053 = vector.extract %216[3, 9, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2054 = vector.insert %2053, %2052 [0, 3, 6, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2055 = vector.extract %216[3, 9, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2056 = vector.insert %2055, %2054 [0, 3, 7, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2057 = vector.extract %216[3, 9, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2058 = vector.insert %2057, %2056 [0, 3, 8, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2059 = vector.extract %216[3, 9, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2060 = vector.insert %2059, %2058 [0, 3, 9, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2061 = vector.extract %216[3, 9, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2062 = vector.insert %2061, %2060 [0, 3, 10, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2063 = vector.extract %216[3, 9, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2064 = vector.insert %2063, %2062 [0, 3, 11, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2065 = vector.extract %216[3, 9, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2066 = vector.insert %2065, %2064 [0, 3, 12, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2067 = vector.extract %216[3, 9, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2068 = vector.insert %2067, %2066 [0, 3, 13, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2069 = vector.extract %216[3, 9, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2070 = vector.insert %2069, %2068 [0, 3, 14, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2071 = vector.extract %216[3, 9, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2072 = vector.insert %2071, %2070 [0, 3, 15, 9] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2073 = vector.extract %216[3, 10, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2074 = vector.insert %2073, %2072 [0, 3, 0, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2075 = vector.extract %216[3, 10, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2076 = vector.insert %2075, %2074 [0, 3, 1, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2077 = vector.extract %216[3, 10, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2078 = vector.insert %2077, %2076 [0, 3, 2, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2079 = vector.extract %216[3, 10, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2080 = vector.insert %2079, %2078 [0, 3, 3, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2081 = vector.extract %216[3, 10, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2082 = vector.insert %2081, %2080 [0, 3, 4, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2083 = vector.extract %216[3, 10, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2084 = vector.insert %2083, %2082 [0, 3, 5, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2085 = vector.extract %216[3, 10, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2086 = vector.insert %2085, %2084 [0, 3, 6, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2087 = vector.extract %216[3, 10, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2088 = vector.insert %2087, %2086 [0, 3, 7, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2089 = vector.extract %216[3, 10, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2090 = vector.insert %2089, %2088 [0, 3, 8, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2091 = vector.extract %216[3, 10, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2092 = vector.insert %2091, %2090 [0, 3, 9, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2093 = vector.extract %216[3, 10, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2094 = vector.insert %2093, %2092 [0, 3, 10, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2095 = vector.extract %216[3, 10, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2096 = vector.insert %2095, %2094 [0, 3, 11, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2097 = vector.extract %216[3, 10, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2098 = vector.insert %2097, %2096 [0, 3, 12, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2099 = vector.extract %216[3, 10, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2100 = vector.insert %2099, %2098 [0, 3, 13, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2101 = vector.extract %216[3, 10, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2102 = vector.insert %2101, %2100 [0, 3, 14, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2103 = vector.extract %216[3, 10, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2104 = vector.insert %2103, %2102 [0, 3, 15, 10] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2105 = vector.extract %216[3, 11, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2106 = vector.insert %2105, %2104 [0, 3, 0, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2107 = vector.extract %216[3, 11, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2108 = vector.insert %2107, %2106 [0, 3, 1, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2109 = vector.extract %216[3, 11, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2110 = vector.insert %2109, %2108 [0, 3, 2, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2111 = vector.extract %216[3, 11, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2112 = vector.insert %2111, %2110 [0, 3, 3, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2113 = vector.extract %216[3, 11, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2114 = vector.insert %2113, %2112 [0, 3, 4, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2115 = vector.extract %216[3, 11, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2116 = vector.insert %2115, %2114 [0, 3, 5, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2117 = vector.extract %216[3, 11, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2118 = vector.insert %2117, %2116 [0, 3, 6, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2119 = vector.extract %216[3, 11, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2120 = vector.insert %2119, %2118 [0, 3, 7, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2121 = vector.extract %216[3, 11, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2122 = vector.insert %2121, %2120 [0, 3, 8, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2123 = vector.extract %216[3, 11, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2124 = vector.insert %2123, %2122 [0, 3, 9, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2125 = vector.extract %216[3, 11, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2126 = vector.insert %2125, %2124 [0, 3, 10, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2127 = vector.extract %216[3, 11, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2128 = vector.insert %2127, %2126 [0, 3, 11, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2129 = vector.extract %216[3, 11, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2130 = vector.insert %2129, %2128 [0, 3, 12, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2131 = vector.extract %216[3, 11, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2132 = vector.insert %2131, %2130 [0, 3, 13, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2133 = vector.extract %216[3, 11, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2134 = vector.insert %2133, %2132 [0, 3, 14, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2135 = vector.extract %216[3, 11, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2136 = vector.insert %2135, %2134 [0, 3, 15, 11] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2137 = vector.extract %216[3, 12, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2138 = vector.insert %2137, %2136 [0, 3, 0, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2139 = vector.extract %216[3, 12, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2140 = vector.insert %2139, %2138 [0, 3, 1, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2141 = vector.extract %216[3, 12, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2142 = vector.insert %2141, %2140 [0, 3, 2, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2143 = vector.extract %216[3, 12, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2144 = vector.insert %2143, %2142 [0, 3, 3, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2145 = vector.extract %216[3, 12, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2146 = vector.insert %2145, %2144 [0, 3, 4, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2147 = vector.extract %216[3, 12, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2148 = vector.insert %2147, %2146 [0, 3, 5, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2149 = vector.extract %216[3, 12, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2150 = vector.insert %2149, %2148 [0, 3, 6, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2151 = vector.extract %216[3, 12, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2152 = vector.insert %2151, %2150 [0, 3, 7, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2153 = vector.extract %216[3, 12, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2154 = vector.insert %2153, %2152 [0, 3, 8, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2155 = vector.extract %216[3, 12, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2156 = vector.insert %2155, %2154 [0, 3, 9, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2157 = vector.extract %216[3, 12, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2158 = vector.insert %2157, %2156 [0, 3, 10, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2159 = vector.extract %216[3, 12, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2160 = vector.insert %2159, %2158 [0, 3, 11, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2161 = vector.extract %216[3, 12, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2162 = vector.insert %2161, %2160 [0, 3, 12, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2163 = vector.extract %216[3, 12, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2164 = vector.insert %2163, %2162 [0, 3, 13, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2165 = vector.extract %216[3, 12, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2166 = vector.insert %2165, %2164 [0, 3, 14, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2167 = vector.extract %216[3, 12, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2168 = vector.insert %2167, %2166 [0, 3, 15, 12] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2169 = vector.extract %216[3, 13, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2170 = vector.insert %2169, %2168 [0, 3, 0, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2171 = vector.extract %216[3, 13, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2172 = vector.insert %2171, %2170 [0, 3, 1, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2173 = vector.extract %216[3, 13, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2174 = vector.insert %2173, %2172 [0, 3, 2, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2175 = vector.extract %216[3, 13, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2176 = vector.insert %2175, %2174 [0, 3, 3, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2177 = vector.extract %216[3, 13, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2178 = vector.insert %2177, %2176 [0, 3, 4, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2179 = vector.extract %216[3, 13, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2180 = vector.insert %2179, %2178 [0, 3, 5, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2181 = vector.extract %216[3, 13, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2182 = vector.insert %2181, %2180 [0, 3, 6, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2183 = vector.extract %216[3, 13, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2184 = vector.insert %2183, %2182 [0, 3, 7, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2185 = vector.extract %216[3, 13, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2186 = vector.insert %2185, %2184 [0, 3, 8, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2187 = vector.extract %216[3, 13, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2188 = vector.insert %2187, %2186 [0, 3, 9, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2189 = vector.extract %216[3, 13, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2190 = vector.insert %2189, %2188 [0, 3, 10, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2191 = vector.extract %216[3, 13, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2192 = vector.insert %2191, %2190 [0, 3, 11, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2193 = vector.extract %216[3, 13, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2194 = vector.insert %2193, %2192 [0, 3, 12, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2195 = vector.extract %216[3, 13, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2196 = vector.insert %2195, %2194 [0, 3, 13, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2197 = vector.extract %216[3, 13, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2198 = vector.insert %2197, %2196 [0, 3, 14, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2199 = vector.extract %216[3, 13, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2200 = vector.insert %2199, %2198 [0, 3, 15, 13] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2201 = vector.extract %216[3, 14, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2202 = vector.insert %2201, %2200 [0, 3, 0, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2203 = vector.extract %216[3, 14, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2204 = vector.insert %2203, %2202 [0, 3, 1, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2205 = vector.extract %216[3, 14, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2206 = vector.insert %2205, %2204 [0, 3, 2, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2207 = vector.extract %216[3, 14, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2208 = vector.insert %2207, %2206 [0, 3, 3, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2209 = vector.extract %216[3, 14, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2210 = vector.insert %2209, %2208 [0, 3, 4, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2211 = vector.extract %216[3, 14, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2212 = vector.insert %2211, %2210 [0, 3, 5, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2213 = vector.extract %216[3, 14, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2214 = vector.insert %2213, %2212 [0, 3, 6, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2215 = vector.extract %216[3, 14, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2216 = vector.insert %2215, %2214 [0, 3, 7, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2217 = vector.extract %216[3, 14, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2218 = vector.insert %2217, %2216 [0, 3, 8, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2219 = vector.extract %216[3, 14, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2220 = vector.insert %2219, %2218 [0, 3, 9, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2221 = vector.extract %216[3, 14, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2222 = vector.insert %2221, %2220 [0, 3, 10, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2223 = vector.extract %216[3, 14, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2224 = vector.insert %2223, %2222 [0, 3, 11, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2225 = vector.extract %216[3, 14, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2226 = vector.insert %2225, %2224 [0, 3, 12, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2227 = vector.extract %216[3, 14, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2228 = vector.insert %2227, %2226 [0, 3, 13, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2229 = vector.extract %216[3, 14, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2230 = vector.insert %2229, %2228 [0, 3, 14, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2231 = vector.extract %216[3, 14, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2232 = vector.insert %2231, %2230 [0, 3, 15, 14] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2233 = vector.extract %216[3, 15, 0] : vector<1xf16> from vector<4x16x16x1xf16>
%2234 = vector.insert %2233, %2232 [0, 3, 0, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2235 = vector.extract %216[3, 15, 1] : vector<1xf16> from vector<4x16x16x1xf16>
%2236 = vector.insert %2235, %2234 [0, 3, 1, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2237 = vector.extract %216[3, 15, 2] : vector<1xf16> from vector<4x16x16x1xf16>
%2238 = vector.insert %2237, %2236 [0, 3, 2, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2239 = vector.extract %216[3, 15, 3] : vector<1xf16> from vector<4x16x16x1xf16>
%2240 = vector.insert %2239, %2238 [0, 3, 3, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2241 = vector.extract %216[3, 15, 4] : vector<1xf16> from vector<4x16x16x1xf16>
%2242 = vector.insert %2241, %2240 [0, 3, 4, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2243 = vector.extract %216[3, 15, 5] : vector<1xf16> from vector<4x16x16x1xf16>
%2244 = vector.insert %2243, %2242 [0, 3, 5, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2245 = vector.extract %216[3, 15, 6] : vector<1xf16> from vector<4x16x16x1xf16>
%2246 = vector.insert %2245, %2244 [0, 3, 6, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2247 = vector.extract %216[3, 15, 7] : vector<1xf16> from vector<4x16x16x1xf16>
%2248 = vector.insert %2247, %2246 [0, 3, 7, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2249 = vector.extract %216[3, 15, 8] : vector<1xf16> from vector<4x16x16x1xf16>
%2250 = vector.insert %2249, %2248 [0, 3, 8, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2251 = vector.extract %216[3, 15, 9] : vector<1xf16> from vector<4x16x16x1xf16>
%2252 = vector.insert %2251, %2250 [0, 3, 9, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2253 = vector.extract %216[3, 15, 10] : vector<1xf16> from vector<4x16x16x1xf16>
%2254 = vector.insert %2253, %2252 [0, 3, 10, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2255 = vector.extract %216[3, 15, 11] : vector<1xf16> from vector<4x16x16x1xf16>
%2256 = vector.insert %2255, %2254 [0, 3, 11, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2257 = vector.extract %216[3, 15, 12] : vector<1xf16> from vector<4x16x16x1xf16>
%2258 = vector.insert %2257, %2256 [0, 3, 12, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2259 = vector.extract %216[3, 15, 13] : vector<1xf16> from vector<4x16x16x1xf16>
%2260 = vector.insert %2259, %2258 [0, 3, 13, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2261 = vector.extract %216[3, 15, 14] : vector<1xf16> from vector<4x16x16x1xf16>
%2262 = vector.insert %2261, %2260 [0, 3, 14, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2263 = vector.extract %216[3, 15, 15] : vector<1xf16> from vector<4x16x16x1xf16>
%2264 = vector.insert %2263, %2262 [0, 3, 15, 15] : vector<1xf16> into vector<1x4x16x16x1xf16>
%2265 = vector.extract %2264[0] : vector<4x16x16x1xf16> from vector<1x4x16x16x1xf16>
%subview_5 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%2266 = vector.shape_cast %2265 : vector<4x16x16x1xf16> to vector<4x16x16xf16>
%2267 = vector.extract %2266[0, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2267, %subview_5[%arg3, %c0, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2268 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2269 = vector.extract %2266[0, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2269, %subview_5[%arg3, %c0, %2268, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2270 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2271 = vector.extract %2266[0, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2271, %subview_5[%arg3, %c0, %2270, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2272 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2273 = vector.extract %2266[0, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2273, %subview_5[%arg3, %c0, %2272, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2274 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2275 = vector.extract %2266[0, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2275, %subview_5[%arg3, %c0, %2274, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2276 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2277 = vector.extract %2266[0, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2277, %subview_5[%arg3, %c0, %2276, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2278 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2279 = vector.extract %2266[0, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2279, %subview_5[%arg3, %c0, %2278, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2280 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2281 = vector.extract %2266[0, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2281, %subview_5[%arg3, %c0, %2280, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2282 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2283 = vector.extract %2266[0, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2283, %subview_5[%arg3, %c0, %2282, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2284 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2285 = vector.extract %2266[0, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2285, %subview_5[%arg3, %c0, %2284, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2286 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2287 = vector.extract %2266[0, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2287, %subview_5[%arg3, %c0, %2286, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2288 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2289 = vector.extract %2266[0, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2289, %subview_5[%arg3, %c0, %2288, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2290 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2291 = vector.extract %2266[0, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2291, %subview_5[%arg3, %c0, %2290, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2292 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2293 = vector.extract %2266[0, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2293, %subview_5[%arg3, %c0, %2292, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2294 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2295 = vector.extract %2266[0, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2295, %subview_5[%arg3, %c0, %2294, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2296 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2297 = vector.extract %2266[0, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2297, %subview_5[%arg3, %c0, %2296, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2298 = vector.extract %2266[1, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2298, %subview_5[%arg3, %c1, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2299 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2300 = vector.extract %2266[1, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2300, %subview_5[%arg3, %c1, %2299, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2301 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2302 = vector.extract %2266[1, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2302, %subview_5[%arg3, %c1, %2301, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2303 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2304 = vector.extract %2266[1, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2304, %subview_5[%arg3, %c1, %2303, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2305 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2306 = vector.extract %2266[1, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2306, %subview_5[%arg3, %c1, %2305, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2307 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2308 = vector.extract %2266[1, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2308, %subview_5[%arg3, %c1, %2307, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2309 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2310 = vector.extract %2266[1, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2310, %subview_5[%arg3, %c1, %2309, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2311 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2312 = vector.extract %2266[1, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2312, %subview_5[%arg3, %c1, %2311, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2313 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2314 = vector.extract %2266[1, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2314, %subview_5[%arg3, %c1, %2313, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2315 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2316 = vector.extract %2266[1, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2316, %subview_5[%arg3, %c1, %2315, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2317 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2318 = vector.extract %2266[1, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2318, %subview_5[%arg3, %c1, %2317, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2319 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2320 = vector.extract %2266[1, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2320, %subview_5[%arg3, %c1, %2319, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2321 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2322 = vector.extract %2266[1, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2322, %subview_5[%arg3, %c1, %2321, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2323 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2324 = vector.extract %2266[1, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2324, %subview_5[%arg3, %c1, %2323, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2325 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2326 = vector.extract %2266[1, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2326, %subview_5[%arg3, %c1, %2325, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2327 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2328 = vector.extract %2266[1, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2328, %subview_5[%arg3, %c1, %2327, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2329 = vector.extract %2266[2, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2329, %subview_5[%arg3, %c2, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2330 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2331 = vector.extract %2266[2, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2331, %subview_5[%arg3, %c2, %2330, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2332 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2333 = vector.extract %2266[2, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2333, %subview_5[%arg3, %c2, %2332, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2334 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2335 = vector.extract %2266[2, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2335, %subview_5[%arg3, %c2, %2334, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2336 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2337 = vector.extract %2266[2, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2337, %subview_5[%arg3, %c2, %2336, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2338 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2339 = vector.extract %2266[2, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2339, %subview_5[%arg3, %c2, %2338, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2340 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2341 = vector.extract %2266[2, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2341, %subview_5[%arg3, %c2, %2340, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2342 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2343 = vector.extract %2266[2, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2343, %subview_5[%arg3, %c2, %2342, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2344 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2345 = vector.extract %2266[2, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2345, %subview_5[%arg3, %c2, %2344, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2346 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2347 = vector.extract %2266[2, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2347, %subview_5[%arg3, %c2, %2346, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2348 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2349 = vector.extract %2266[2, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2349, %subview_5[%arg3, %c2, %2348, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2350 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2351 = vector.extract %2266[2, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2351, %subview_5[%arg3, %c2, %2350, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2352 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2353 = vector.extract %2266[2, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2353, %subview_5[%arg3, %c2, %2352, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2354 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2355 = vector.extract %2266[2, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2355, %subview_5[%arg3, %c2, %2354, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2356 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2357 = vector.extract %2266[2, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2357, %subview_5[%arg3, %c2, %2356, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2358 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2359 = vector.extract %2266[2, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2359, %subview_5[%arg3, %c2, %2358, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2360 = vector.extract %2266[3, 0] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2360, %subview_5[%arg3, %c3, %arg4, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2361 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg4)
%2362 = vector.extract %2266[3, 1] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2362, %subview_5[%arg3, %c3, %2361, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2363 = affine.apply affine_map<(d0) -> (d0 + 2)>(%arg4)
%2364 = vector.extract %2266[3, 2] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2364, %subview_5[%arg3, %c3, %2363, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2365 = affine.apply affine_map<(d0) -> (d0 + 3)>(%arg4)
%2366 = vector.extract %2266[3, 3] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2366, %subview_5[%arg3, %c3, %2365, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2367 = affine.apply affine_map<(d0) -> (d0 + 4)>(%arg4)
%2368 = vector.extract %2266[3, 4] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2368, %subview_5[%arg3, %c3, %2367, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2369 = affine.apply affine_map<(d0) -> (d0 + 5)>(%arg4)
%2370 = vector.extract %2266[3, 5] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2370, %subview_5[%arg3, %c3, %2369, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2371 = affine.apply affine_map<(d0) -> (d0 + 6)>(%arg4)
%2372 = vector.extract %2266[3, 6] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2372, %subview_5[%arg3, %c3, %2371, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2373 = affine.apply affine_map<(d0) -> (d0 + 7)>(%arg4)
%2374 = vector.extract %2266[3, 7] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2374, %subview_5[%arg3, %c3, %2373, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2375 = affine.apply affine_map<(d0) -> (d0 + 8)>(%arg4)
%2376 = vector.extract %2266[3, 8] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2376, %subview_5[%arg3, %c3, %2375, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2377 = affine.apply affine_map<(d0) -> (d0 + 9)>(%arg4)
%2378 = vector.extract %2266[3, 9] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2378, %subview_5[%arg3, %c3, %2377, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2379 = affine.apply affine_map<(d0) -> (d0 + 10)>(%arg4)
%2380 = vector.extract %2266[3, 10] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2380, %subview_5[%arg3, %c3, %2379, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2381 = affine.apply affine_map<(d0) -> (d0 + 11)>(%arg4)
%2382 = vector.extract %2266[3, 11] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2382, %subview_5[%arg3, %c3, %2381, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2383 = affine.apply affine_map<(d0) -> (d0 + 12)>(%arg4)
%2384 = vector.extract %2266[3, 12] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2384, %subview_5[%arg3, %c3, %2383, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2385 = affine.apply affine_map<(d0) -> (d0 + 13)>(%arg4)
%2386 = vector.extract %2266[3, 13] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2386, %subview_5[%arg3, %c3, %2385, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2387 = affine.apply affine_map<(d0) -> (d0 + 14)>(%arg4)
%2388 = vector.extract %2266[3, 14] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2388, %subview_5[%arg3, %c3, %2387, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%2389 = affine.apply affine_map<(d0) -> (d0 + 15)>(%arg4)
%2390 = vector.extract %2266[3, 15] : vector<16xf16> from vector<4x16x16xf16>
vector.store %2390, %subview_5[%arg3, %c3, %2389, %c0] : memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
}
}
}
}
}
return
}
// -----// IR Dump After LLVMCPUVectorShapeCastLowering (iree-llvmcpu-vector-shape-cast-lowering) //----- //
func.func @turbine_llm_mmtfp_3d_8640_3200_f32f16_dispatch_1_broadcast_Dx8640x3200_f16_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%cst = arith.constant dense<0.000000e+00> : vector<4x16x16xf16>
%c63 = arith.constant 63 : index
%c62 = arith.constant 62 : index
%c61 = arith.constant 61 : index
%c60 = arith.constant 60 : index
%c59 = arith.constant 59 : index
%c58 = arith.constant 58 : index
%c57 = arith.constant 57 : index
%c56 = arith.constant 56 : index
%c55 = arith.constant 55 : index
%c54 = arith.constant 54 : index
%c53 = arith.constant 53 : index
%c52 = arith.constant 52 : index
%c51 = arith.constant 51 : index
%c50 = arith.constant 50 : index
%c49 = arith.constant 49 : index
%c48 = arith.constant 48 : index
%c47 = arith.constant 47 : index
%c46 = arith.constant 46 : index
%c45 = arith.constant 45 : index
%c44 = arith.constant 44 : index
%c43 = arith.constant 43 : index
%c42 = arith.constant 42 : index
%c41 = arith.constant 41 : index
%c40 = arith.constant 40 : index
%c39 = arith.constant 39 : index
%c38 = arith.constant 38 : index
%c37 = arith.constant 37 : index
%c36 = arith.constant 36 : index
%c35 = arith.constant 35 : index
%c34 = arith.constant 34 : index
%c33 = arith.constant 33 : index
%c32 = arith.constant 32 : index
%c31 = arith.constant 31 : index
%c30 = arith.constant 30 : index
%c29 = arith.constant 29 : index
%c28 = arith.constant 28 : index
%c27 = arith.constant 27 : index
%c26 = arith.constant 26 : index
%c25 = arith.constant 25 : index
%c24 = arith.constant 24 : index
%c23 = arith.constant 23 : index
%c22 = arith.constant 22 : index
%c21 = arith.constant 21 : index
%c20 = arith.constant 20 : index
%c19 = arith.constant 19 : index
%c18 = arith.constant 18 : index
%c17 = arith.constant 17 : index
%c15 = arith.constant 15 : index
%c14 = arith.constant 14 : index
%c13 = arith.constant 13 : index
%c12 = arith.constant 12 : index
%c11 = arith.constant 11 : index
%c10 = arith.constant 10 : index
%c9 = arith.constant 9 : index
%c8 = arith.constant 8 : index
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c32_i64 = arith.constant 32 : i64
%c0 = arith.constant 0 : index
%c540 = arith.constant 540 : index
%c3200 = arith.constant 3200 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x64x16xf16>
%0 = hal.interface.constant.load[0] : i32
%1 = hal.interface.constant.load[1] : i32
%2 = hal.interface.constant.load[2] : i32
%3 = hal.interface.constant.load[3] : i32
%4 = arith.extui %0 : i32 to i64
%5 = arith.extui %1 : i32 to i64
%6 = arith.shli %5, %c32_i64 : i64
%7 = arith.ori %4, %6 : i64
%8 = arith.index_castui %7 : i64 to index
%9 = arith.extui %2 : i32 to i64
%10 = arith.extui %3 : i32 to i64
%11 = arith.shli %10, %c32_i64 : i64
%12 = arith.ori %9, %11 : i64
%13 = arith.index_castui %12 : i64 to index
%14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %14, 64 : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>>
%15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>{%13}
memref.assume_alignment %15, 1 : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%16 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
%20 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%21 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %16 to %13 step %17 {
%22 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 64)>(%arg0)[%13]
scf.for %arg1 = %18 to %c540 step %19 {
%23 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
scf.for %arg2 = %20 to %c3200 step %21 {
%subview = memref.subview %15[%arg0, %arg1, %arg2, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x540x3200x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %14[%23, %arg2] [64, 64] [1, 1] : memref<8640x3200xf16, #hal.descriptor_type<storage_buffer>> to memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
scf.for %arg3 = %c0 to %22 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c16 {
%24 = vector.load %subview_0[%c0, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%25 = vector.load %subview_0[%c1, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%26 = vector.load %subview_0[%c2, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%27 = vector.load %subview_0[%c3, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%28 = vector.load %subview_0[%c4, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%29 = vector.load %subview_0[%c5, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%30 = vector.load %subview_0[%c6, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%31 = vector.load %subview_0[%c7, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%32 = vector.load %subview_0[%c8, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%33 = vector.load %subview_0[%c9, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%34 = vector.load %subview_0[%c10, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%35 = vector.load %subview_0[%c11, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%36 = vector.load %subview_0[%c12, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%37 = vector.load %subview_0[%c13, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%38 = vector.load %subview_0[%c14, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%39 = vector.load %subview_0[%c15, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%40 = vector.load %subview_0[%c16, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%41 = vector.load %subview_0[%c17, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%42 = vector.load %subview_0[%c18, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%43 = vector.load %subview_0[%c19, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%44 = vector.load %subview_0[%c20, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%45 = vector.load %subview_0[%c21, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%46 = vector.load %subview_0[%c22, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%47 = vector.load %subview_0[%c23, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%48 = vector.load %subview_0[%c24, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%49 = vector.load %subview_0[%c25, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%50 = vector.load %subview_0[%c26, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%51 = vector.load %subview_0[%c27, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%52 = vector.load %subview_0[%c28, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%53 = vector.load %subview_0[%c29, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%54 = vector.load %subview_0[%c30, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%55 = vector.load %subview_0[%c31, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%56 = vector.load %subview_0[%c32, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%57 = vector.load %subview_0[%c33, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%58 = vector.load %subview_0[%c34, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%59 = vector.load %subview_0[%c35, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%60 = vector.load %subview_0[%c36, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%61 = vector.load %subview_0[%c37, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%62 = vector.load %subview_0[%c38, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%63 = vector.load %subview_0[%c39, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%64 = vector.load %subview_0[%c40, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%65 = vector.load %subview_0[%c41, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%66 = vector.load %subview_0[%c42, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%67 = vector.load %subview_0[%c43, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%68 = vector.load %subview_0[%c44, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%69 = vector.load %subview_0[%c45, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%70 = vector.load %subview_0[%c46, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%71 = vector.load %subview_0[%c47, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%72 = vector.load %subview_0[%c48, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%73 = vector.load %subview_0[%c49, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%74 = vector.load %subview_0[%c50, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%75 = vector.load %subview_0[%c51, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%76 = vector.load %subview_0[%c52, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%77 = vector.load %subview_0[%c53, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%78 = vector.load %subview_0[%c54, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%79 = vector.load %subview_0[%c55, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%80 = vector.load %subview_0[%c56, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%81 = vector.load %subview_0[%c57, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%82 = vector.load %subview_0[%c58, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%83 = vector.load %subview_0[%c59, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%84 = vector.load %subview_0[%c60, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%85 = vector.load %subview_0[%c61, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%86 = vector.load %subview_0[%c62, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%87 = vector.load %subview_0[%c63, %arg4] : memref<64x64xf16, strided<[3200, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<16xf16>
%subview_1 = memref.subview %alloca[0, 0, 0] [1, 64, 16] [1, 1, 1] : memref<1x64x16xf16> to memref<64x16xf16>
vector.store %24, %subview_1[%c0, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %25, %subview_1[%c1, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %26, %subview_1[%c2, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %27, %subview_1[%c3, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %28, %subview_1[%c4, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %29, %subview_1[%c5, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %30, %subview_1[%c6, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %31, %subview_1[%c7, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %32, %subview_1[%c8, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %33, %subview_1[%c9, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %34, %subview_1[%c10, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %35, %subview_1[%c11, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %36, %subview_1[%c12, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %37, %subview_1[%c13, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %38, %subview_1[%c14, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %39, %subview_1[%c15, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %40, %subview_1[%c16, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %41, %subview_1[%c17, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %42, %subview_1[%c18, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %43, %subview_1[%c19, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %44, %subview_1[%c20, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %45, %subview_1[%c21, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %46, %subview_1[%c22, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %47, %subview_1[%c23, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %48, %subview_1[%c24, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %49, %subview_1[%c25, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %50, %subview_1[%c26, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %51, %subview_1[%c27, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %52, %subview_1[%c28, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %53, %subview_1[%c29, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %54, %subview_1[%c30, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %55, %subview_1[%c31, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %56, %subview_1[%c32, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %57, %subview_1[%c33, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %58, %subview_1[%c34, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %59, %subview_1[%c35, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %60, %subview_1[%c36, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %61, %subview_1[%c37, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %62, %subview_1[%c38, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %63, %subview_1[%c39, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %64, %subview_1[%c40, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %65, %subview_1[%c41, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %66, %subview_1[%c42, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %67, %subview_1[%c43, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %68, %subview_1[%c44, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %69, %subview_1[%c45, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %70, %subview_1[%c46, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %71, %subview_1[%c47, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %72, %subview_1[%c48, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %73, %subview_1[%c49, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %74, %subview_1[%c50, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %75, %subview_1[%c51, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %76, %subview_1[%c52, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %77, %subview_1[%c53, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %78, %subview_1[%c54, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %79, %subview_1[%c55, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %80, %subview_1[%c56, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %81, %subview_1[%c57, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %82, %subview_1[%c58, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %83, %subview_1[%c59, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %84, %subview_1[%c60, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %85, %subview_1[%c61, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %86, %subview_1[%c62, %c0] : memref<64x16xf16>, vector<16xf16>
vector.store %87, %subview_1[%c63, %c0] : memref<64x16xf16>, vector<16xf16>
%expand_shape = memref.expand_shape %alloca [[0], [1, 2], [3, 4]] : memref<1x64x16xf16> into memref<1x4x16x16x1xf16>
%subview_2 = memref.subview %expand_shape[0, 0, 0, 0, 0] [1, 4, 16, 16, 1] [1, 1, 1, 1, 1] : memref<1x4x16x16x1xf16> to memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>>
%subview_3 = memref.subview %subview_2[0, 0, 0, 0] [1, 4, 16, 16] [1, 1, 1, 1] : memref<1x4x16x16xf16, strided<[1024, 256, 16, 1]>> to memref<4x16x16xf16>
%88 = vector.load %subview_3[%c0, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%89 = vector.load %subview_3[%c0, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%90 = vector.load %subview_3[%c0, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%91 = vector.load %subview_3[%c0, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%92 = vector.load %subview_3[%c0, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%93 = vector.load %subview_3[%c0, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%94 = vector.load %subview_3[%c0, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%95 = vector.load %subview_3[%c0, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%96 = vector.load %subview_3[%c0, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%97 = vector.load %subview_3[%c0, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%98 = vector.load %subview_3[%c0, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%99 = vector.load %subview_3[%c0, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%100 = vector.load %subview_3[%c0, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%101 = vector.load %subview_3[%c0, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%102 = vector.load %subview_3[%c0, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%103 = vector.load %subview_3[%c0, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%104 = vector.load %subview_3[%c1, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%105 = vector.load %subview_3[%c1, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%106 = vector.load %subview_3[%c1, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%107 = vector.load %subview_3[%c1, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%108 = vector.load %subview_3[%c1, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%109 = vector.load %subview_3[%c1, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%110 = vector.load %subview_3[%c1, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%111 = vector.load %subview_3[%c1, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%112 = vector.load %subview_3[%c1, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%113 = vector.load %subview_3[%c1, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%114 = vector.load %subview_3[%c1, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%115 = vector.load %subview_3[%c1, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%116 = vector.load %subview_3[%c1, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%117 = vector.load %subview_3[%c1, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%118 = vector.load %subview_3[%c1, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%119 = vector.load %subview_3[%c1, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%120 = vector.load %subview_3[%c2, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%121 = vector.load %subview_3[%c2, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%122 = vector.load %subview_3[%c2, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%123 = vector.load %subview_3[%c2, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%124 = vector.load %subview_3[%c2, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%125 = vector.load %subview_3[%c2, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%126 = vector.load %subview_3[%c2, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%127 = vector.load %subview_3[%c2, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%128 = vector.load %subview_3[%c2, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%129 = vector.load %subview_3[%c2, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%130 = vector.load %subview_3[%c2, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%131 = vector.load %subview_3[%c2, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%132 = vector.load %subview_3[%c2, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%133 = vector.load %subview_3[%c2, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%134 = vector.load %subview_3[%c2, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%135 = vector.load %subview_3[%c2, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%136 = vector.load %subview_3[%c3, %c0, %c0] : memref<4x16x16xf16>, vector<16xf16>
%137 = vector.load %subview_3[%c3, %c1, %c0] : memref<4x16x16xf16>, vector<16xf16>
%138 = vector.load %subview_3[%c3, %c2, %c0] : memref<4x16x16xf16>, vector<16xf16>
%139 = vector.load %subview_3[%c3, %c3, %c0] : memref<4x16x16xf16>, vector<16xf16>
%140 = vector.load %subview_3[%c3, %c4, %c0] : memref<4x16x16xf16>, vector<16xf16>
%141 = vector.load %subview_3[%c3, %c5, %c0] : memref<4x16x16xf16>, vector<16xf16>
%142 = vector.load %subview_3[%c3, %c6, %c0] : memref<4x16x16xf16>, vector<16xf16>
%143 = vector.load %subview_3[%c3, %c7, %c0] : memref<4x16x16xf16>, vector<16xf16>
%144 = vector.load %subview_3[%c3, %c8, %c0] : memref<4x16x16xf16>, vector<16xf16>
%145 = vector.load %subview_3[%c3, %c9, %c0] : memref<4x16x16xf16>, vector<16xf16>
%146 = vector.load %subview_3[%c3, %c10, %c0] : memref<4x16x16xf16>, vector<16xf16>
%147 = vector.load %subview_3[%c3, %c11, %c0] : memref<4x16x16xf16>, vector<16xf16>
%148 = vector.load %subview_3[%c3, %c12, %c0] : memref<4x16x16xf16>, vector<16xf16>
%149 = vector.load %subview_3[%c3, %c13, %c0] : memref<4x16x16xf16>, vector<16xf16>
%150 = vector.load %subview_3[%c3, %c14, %c0] : memref<4x16x16xf16>, vector<16xf16>
%151 = vector.load %subview_3[%c3, %c15, %c0] : memref<4x16x16xf16>, vector<16xf16>
%subview_4 = memref.subview %subview[0, 0, 0, 0, 0] [%22, 4, 64, 16, 1] [1, 1, 1, 1, 1] : memref<?x4x64x16x1xf16, strided<[27648000, 51200, 16, 1, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<?x4x64x16xf16, strided<[27648000, 51200, 16, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%152 = vector.extract %88[0] : f16 from vector<16xf16>
%153 = vector.insert %152, %cst [0, 0, 0] : f16 into vector<4x16x16xf16>
%154 = vector.extract %89[0] : f16 from vector<16xf16>
%155 = vector.insert %154, %153 [0, 0, 1] : f16 into vector<4x16x16xf16>
%156 = vector.extract %90[0] : f16 from vector<16xf16>
%157 = vector.insert %156, %155 [0, 0, 2] : f16 into vector<4x16x16xf16>
%158 = vector.extract %91[0] : f16 from vector<16xf16>
%159 = vector.insert %158, %157 [0, 0, 3] : f16 into vector<4x16x16xf16>
%160 = vector.extract %92[0] : f16 from vector<16xf16>
%161 = vector.insert %160, %159 [0, 0, 4] : f16 into vector<4x16x16xf16>
%162 = vector.extract %93[0] : f16 from vector<16xf16>
%163 = vector.insert %162, %161 [0, 0, 5] : f16 into vector<4x16x16xf16>
%164 = vector.extract %94[0] : f16 from vector<16xf16>
%165 = vector.insert %164, %163 [0, 0, 6] : f16 into vector<4x16x16xf16>
%166 = vector.extract %95[0] : f16 from vector<16xf16>
%167 = vector.insert %166, %165 [0, 0, 7] : f16 into vector<4x16x16xf16>
%168 = vector.extract %96[0] : f16 from vector<16xf16>
%169 = vector.insert %168, %167 [0, 0, 8] : f16 into vector<4x16x16xf16>
%170 = vector.extract %97[0] : f16 from vector<16xf16>
%171 = vector.insert %170, %169 [0, 0, 9] : f16 into vector<4x16x16xf16>
%172 = vector.extract %98[0] : f16 from vector<16xf16>
%173 = vector.insert %172, %171 [0, 0, 10] : f16 into vector<4x16x16xf16>
%174 = vector.extract %99[0] : f16 from vector<16xf16>
%175 = vector.insert %174, %173 [0, 0, 11] : f16 into vector<4x16x16xf16>
%176 = vector.extract %100[0] : f16 from vector<16xf16>
%177 = vector.insert %176, %175 [0, 0, 12] : f16 into vector<4x16x16xf16>
%178 = vector.extract %101[0] : f16 from vector<16xf16>
%179 = vector.insert %178, %177 [0, 0, 13] : f16 into vector<4x16x16xf16>
%180 = vector.extract %102[0] : f16 from vector<16xf16>
%181 = vector.insert %180, %179 [0, 0, 14] : f16 into vector<4x16x16xf16>
%182 = vector.extract %103[0] : f16 from vector<16xf16>
%183 = vector.insert %182, %181 [0, 0, 15] : f16 into vector<4x16x16xf16>
%184 = vector.extract %88[1] : f16 from vector<16xf16>
%185 = vector.insert %184, %183 [0, 1, 0] : f16 into vector<4x16x16xf16>
%186 = vector.extract %89[1] : f16 from vector<16xf16>
%187 = vector.insert %186, %185 [0, 1, 1] : f16 into vector<4x16x16xf16>
%188 = vector.extract %90[1] : f16 from vector<16xf16>
%189 = vector.insert %188, %187 [0, 1, 2] : f16 into vector<4x16x16xf16>
%190 = vector.extract %91[1] : f16 from vector<16xf16>
%191 = vector.insert %190, %189 [0, 1, 3] : f16 into vector<4x16x16xf16>
%192 = vector.extract %92[1] : f16 from vector<16xf16>
%193 = vector.insert %192, %191 [0, 1, 4] : f16 into vector<4x16x16xf16>
%194 = vector.extract %93[1] : f16 from vector<16xf16>
%195 = vector.insert %194, %193 [0, 1, 5] : f16 into vector<4x16x16xf16>
%196 = vector.extract %94[1] : f16 from vector<16xf16>
%197 = vector.insert %196, %195 [0, 1, 6] : f16 into vector<4x16x16xf16>
%198 = vector.extract %95[1] : f16 from vector<16xf16>
%199 = vector.insert %198, %197 [0, 1, 7] : f16 into vector<4x16x16xf16>
%200 = vector.extract %96[1] : f16 from vector<16xf16>
%201 = vector.insert %200, %199 [0, 1, 8] : f16 into vector<4x16x16xf16>
%202 = vector.extract %97[1] : f16 from vector<16xf16>
%203 = vector.insert %202, %201 [0, 1, 9] : f16 into vector<4x16x16xf16>
%204 = vector.extract %98[1] : f16 from vector<16xf16>
%205 = vector.insert %204, %203 [0, 1, 10] : f16 into vector<4x16x16xf16>
%206 = vector.extract %99[1] : f16 from vector<16xf16>
%207 = vector.insert %206, %205 [0, 1, 11] : f16 into vector<4x16x16xf16>
%208 = vector.extract %100[1] : f16 from vector<16xf16>
%209 = vector.insert %208, %207 [0, 1, 12] : f16 into vector<4x16x16xf16>
%210 = vector.extract %101[1] : f16 from vector<16xf16>
%211 = vector.insert %210, %209 [0, 1, 13] : f16 into vector<4x16x16xf16>
%212 = vector.extract %102[1] : f16 from vector<16xf16>
%213 = vector.insert %212, %211 [0, 1, 14] : f16 into vector<4x16x16xf16>
%214 = vector.extract %103[1] : f16 from vector<16xf16>
%215 = vector.insert %214, %213 [0, 1, 15] : f16 into vector<4x16x16xf16>
%216 = vector.extract %88[2] : f16 from vector<16xf16>
%217 = vector.insert %216, %215 [0, 2, 0] : f16 into vector<4x16x16xf16>
%218 = vector.extract %89[2] : f16 from vector<16xf16>
%219 = vector.insert %218, %217 [0, 2, 1] : f16 into vector<4x16x16xf16>
%220 = vector.extract %90[2] : f16 from vector<16xf16>
%221 = vector.insert %220, %219 [0, 2, 2] : f16 into vector<4x16x16xf16>
%222 = vector.extract %91[2] : f16 from vector<16xf16>
%223 = vector.insert %222, %221 [0, 2, 3] : f16 into vector<4x16x16xf16>
%224 = vector.extract %92[2] : f16 from vector<16xf16>
%225 = vector.insert %224, %223 [0, 2, 4] : f16 into vector<4x16x16xf16>
%226 = vector.extract %93[2] : f16 from vector<16xf16>
%227 = vector.insert %226, %225 [0, 2, 5] : f16 into vector<4x16x16xf16>
%228 = vector.extract %94[2] : f16 from vector<16xf16>
%229 = vector.insert %228, %227 [0, 2, 6] : f16 into vector<4x16x16xf16>
%230 = vector.extract %95[2] : f16 from vector<16xf16>
%231 = vector.insert %230, %229 [0, 2, 7] : f16 into vector<4x16x16xf16>
%232 = vector.extract %96[2] : f16 from vector<16xf16>
%233 = vector.insert %232, %231 [0, 2, 8] : f16 into vector<4x16x16xf16>
%234 = vector.extract %97[2] : f16 from vector<16xf16>
%235 = vector.insert %234, %233 [0, 2, 9] : f16 into vector<4x16x16xf16>
%236 = vector.extract %98[2] : f16 from vector<16xf16>
%237 = vector.insert %236, %235 [0, 2, 10] : f16 into vector<4x16x16xf16>
%238 = vector.extract %99[2] : f16 from vector<16xf16>
%239 = vector.insert %238, %237 [0, 2, 11] : f16 into vector<4x16x16xf16>
%240 = vector.extract %100[2] : f16 from vector<16xf16>
%241 = vector.insert %240, %239 [0, 2, 12] : f16 into vector<4x16x16xf16>
%242 = vector.extract %101[2] : f16 from vector<16xf16>
%243 = vector.insert %242, %241 [0, 2, 13] : f16 into vector<4x16x16xf16>
%244 = vector.extract %102[2] : f16 from vector<16xf16>
%245 = vector.insert %244, %243 [0, 2, 14] : f16 into vector<4x16x16xf16>
%246 = vector.extract %103[2] : f16 from vector<16xf16>
%247 = vector.insert %246, %245 [0, 2, 15] : f16 into vector<4x16x16xf16>
%248 = vector.extract %88[3] : f16 from vector<16xf16>
%249 = vector.insert %248, %247 [0, 3, 0] : f16 into vector<4x16x16xf16>
%250 = vector.extract %89[3] : f16 from vector<16xf16>
%251 = vector.insert %250, %249 [0, 3, 1] : f16 into vector<4x16x16xf16>
%252 = vector.extract %90[3] : f16 from vector<16xf16>
%253 = vector.insert %252, %251 [0, 3, 2] : f16 into vector<4x16x16xf16>
%254 = vector.extract %91[3] : f16 from vector<16xf16>
%255 = vector.insert %254, %253 [0, 3, 3] : f16 into vector<4x16x16xf16>
%256 = vector.extract %92[3] : f16 from vector<16xf16>
%257 = vector.insert %256, %255 [0, 3, 4] : f16 into vector<4x16x16xf16>
%258 = vector.extract %93[3] : f16 from vector<16xf16>
%259 = vector.insert %258, %257 [0, 3, 5] : f16 into vector<4x16x16xf16>
%260 = vector.extract %94[3] : f16 from vector<16xf16>
%261 = vector.insert %260, %259 [0, 3, 6] : f16 into vector<4x16x16xf16>
%262 = vector.extract %95[3] : f16 from vector<16xf16>
%263 = vector.insert %262, %261 [0, 3, 7] : f16 into vector<4x16x16xf16>
%264 = vector.extract %96[3] : f16 from vector<16xf16>
%265 = vector.insert %264, %263 [0, 3, 8] : f16 into vector<4x16x16xf16>
%266 = vector.extract %97[3] : f16 from vector<16xf16>
%267 = vector.insert %266, %265 [0, 3, 9] : f16 into vector<4x16x16xf16>
%268 = vector.extract %98[3] : f16 from vector<16xf16>
%269 = vector.insert %268, %267 [0, 3, 10] : f16 into vector<4x16x16xf16>
%270 = vector.extract %99[3] : f16 from vector<16xf16>
%271 = vector.insert %270, %269 [0, 3, 11] : f16 into vector<4x16x16xf16>
%272 = vector.extract %100[3] : f16 from vector<16xf16>
%273 = vector.insert %272, %271 [0, 3, 12] : f16 into vector<4x16x16xf16>
%274 = vector.extract %101[3] : f16 from vector<16xf16>
%275 = vector.insert %274, %273 [0, 3, 13] : f16 into vector<4x16x16xf16>
%276 = vector.extract %102[3] : f16 from vector<16xf16>
%277 = vector.insert %276, %275 [0, 3, 14] : f16 into vector<4x16x16xf16>
%278 = vector.extract %103[3] : f16 from vector<16xf16>
%279 = vector.insert %278, %277 [0, 3, 15] : f16 into vector<4x16x16xf16>
%280 = vector.extract %88[4] : f16 from vector<16xf16>
%281 = vector.insert %280, %279 [0, 4, 0] : f16 into vector<4x16x16xf16>
%282 = vector.extract %89[4] : f16 from vector<16xf16>
%283 = vector.insert %282, %281 [0, 4, 1] : f16 into vector<4x16x16xf16>
%284 = vector.extract %90[4] : f16 from vector<16xf16>
%285 = vector.insert %284, %283 [0, 4, 2] : f16 into vector<4x16x16xf16>
%286 = vector.extract %91[4] : f16 from vector<16xf16>
%287 = vector.insert %286, %285 [0, 4, 3] : f16 into vector<4x16x16xf16>
%288 = vector.extract %92[4] : f16 from vector<16xf16>
%289 = vector.insert %288, %287 [0, 4, 4] : f16 into vector<4x16x16xf16>
%290 = vector.extract %93[4] : f16 from vector<16xf16>
%291 = vector.insert %290, %289 [0, 4, 5] : f16 into vector<4x16x16xf16>
%292 = vector.extract %94[4] : f16 from vector<16xf16>
%293 = vector.insert %292, %291 [0, 4, 6] : f16 into vector<4x16x16xf16>
%294 = vector.extract %95[4] : f16 from vector<16xf16>
%295 = vector.insert %294, %293 [0, 4, 7] : f16 into vector<4x16x16xf16>
%296 = vector.extract %96[4] : f16 from vector<16xf16>
%297 = vector.insert %296, %295 [0, 4, 8] : f16 into vector<4x16x16xf16>
%298 = vector.extract %97[4] : f16 from vector<16xf16>
%299 = vector.insert %298, %297 [0, 4, 9] : f16 into vector<4x16x16xf16>
%300 = vector.extract %98[4] : f16 from vector<16xf16>
%301 = vector.insert %300, %299 [0, 4, 10] : f16 into vector<4x16x16xf16>
%302 = vector.extract %99[4] : f16 from vector<16xf16>
%303 = vector.insert %302, %301 [0, 4, 11] : f16 into vector<4x16x16xf16>
%304 = vector.extract %100[4] : f16 from vector<16xf16>
%305 = vector.insert %304, %303 [0, 4, 12] : f16 into vector<4x16x16xf16>
%306 = vector.extract %101[4] : f16 from vector<16xf16>
%307 = vector.insert %306, %305 [0, 4, 13] : f16 into vector<4x16x16xf16>
%308 = vector.extract %102[4] : f16 from vector<16xf16>
%309 = vector.insert %308, %307 [0, 4, 14] : f16 into vector<4x16x16xf16>
%310 = vector.extract %103[4] : f16 from vector<16xf16>
%311 = vector.insert %310, %309 [0, 4, 15] : f16 into vector<4x16x16xf16>
%312 = vector.extract %88[5] : f16 from vector<16xf16>
%313 = vector.insert %312, %311 [0, 5, 0] : f16 into vector<4x16x16xf16>
%314 = vector.extract %89[5] : f16 from vector<16xf16>
%315 = vector.insert %314, %313 [0, 5, 1] : f16 into vector<4x16x16xf16>
%316 = vector.extract %90[5] : f16 from vector<16xf16>
%317 = vector.insert %316, %315 [0, 5, 2] : f16 into vector<4x16x16xf16>
%318 = vector.extract %91[5] : f16 from vector<16xf16>
%319 = vector.insert %318, %317 [0, 5, 3] : f16 into vector<4x16x16xf16>
%320 = vector.extract %92[5] : f16 from vector<16xf16>
%321 = vector.insert %320, %319 [0, 5, 4] : f16 into vector<4x16x16xf16>
%322 = vector.extract %93[5] : f16 from vector<16xf16>
%323 = vector.insert %322, %321 [0, 5, 5] : f16 into vector<4x16x16xf16>
%324 = vector.extract %94[5] : f16 from vector<16xf16>
%325 = vector.insert %324, %323 [0, 5, 6] : f16 into vector<4x16x16xf16>
%326 = vector.extract %95[5] : f16 from vector<16xf16>
%327 = vector.insert %326, %325 [0, 5, 7] : f16 into vector<4x16x16xf16>
%328 = vector.extract %96[5] : f16 from vector<16xf16>
%329 = vector.insert %328, %327 [0, 5, 8] : f16 into vector<4x16x16xf16>
%330 = vector.extract %97[5] : f16 from vector<16xf16>
%331 = vector.insert %330, %329 [0, 5, 9] : f16 into vector<4x16x16xf16>
%332 = vector.extract %98[5] : f16 from vector<16xf16>
%333 = vector.insert %332, %331 [0, 5, 10] : f16 into vector<4x16x16xf16>
%334 = vector.extract %99[5] : f16 from vector<16xf16>
%335 = vector.insert %334, %333 [0, 5, 11] : f16 into vector<4x16x16xf16>
%336 = vector.extract %100[5] : f16 from vector<16xf16>
%337 = vector.insert %336, %335 [0, 5, 12] : f16 into vector<4x16x16xf16>
%338 = vector.extract %101[5] : f16 from vector<16xf16>
%339 = vector.insert %338, %337 [0, 5, 13] : f16 into vector<4x16x16xf16>
%340 = vector.extract %102[5] : f16 from vector<16xf16>
%341 = vector.insert %340, %339 [0, 5, 14] : f16 into vector<4x16x16xf16>
%342 = vector.extract %103[5] : f16 from vector<16xf16>
%343 = vector.insert %342, %341 [0, 5, 15] : f16 into vector<4x16x16xf16>
%344 = vector.extract %88[6] : f16 from vector<16xf16>
%345 = vector.insert %344, %343 [0, 6, 0] : f16 into vector<4x16x16xf16>
%346 = vector.extract %89[6] : f16 from vector<16xf16>
%347 = vector.insert %346, %345 [0, 6, 1] : f16 into vector<4x16x16xf16>
%348 = vector.extract %90[6] : f16 from vector<16xf16>
%349 = vector.insert %348, %347 [0, 6, 2] : f16 into vector<4x16x16xf16>
%350 = vector.extract %91[6] : f16 from vector<16xf16>
%351 = vector.insert %350, %349 [0, 6, 3] : f16 into vector<4x16x16xf16>
%352 = vector.extract %92[6] : f16 from vector<16xf16>
%353 = vector.insert %352, %351 [0, 6, 4] : f16 into vector<4x16x16xf16>
%354 = vector.extract %93[6] : f16 from vector<16xf16>
%355 = vector.insert %354, %353 [0, 6, 5] : f16 into vector<4x16x16xf16>
%356 = vector.extract %94[6] : f16 from vector<16xf16>
%357 = vector.insert %356, %355 [0, 6, 6] : f16 into vector<4x16x16xf16>
%358 = vector.extract %95[6] : f16 from vector<16xf16>
%359 = vector.insert %358, %357 [0, 6, 7] : f16 into vector<4x16x16xf16>
%360 = vector.extract %96[6] : f16 from vector<16xf16>
%361 = vector.insert %360, %359 [0, 6, 8] : f16 into vector<4x16x16xf16>
%362 = vector.extract %97[6] : f16 from vector<16xf16>
%363 = vector.insert %362, %361 [0, 6, 9] : f16 into vector<4x16x16xf16>
%364 = vector.extract %98[6] : f16 from vector<16xf16>
%365 = vector.insert %364, %363 [0, 6, 10] : f16 into vector<4x16x16xf16>
%366 = vector.extract %99[6] : f16 from vector<16xf16>
%367 = vector.insert %366, %365 [0, 6, 11] : f16 into vector<4x16x16xf16>
%368 = vector.extract %100[6] : f16 from vector<16xf16>
%369 = vector.insert %368, %367 [0, 6, 12] : f16 into vector<4x16x16xf16>
%370 = vector.extract %101[6] : f16 from vector<16xf16>
%371 = vector.insert %370, %369 [0, 6, 13] : f16 into vector<4x16x16xf16>
%372 = vector.extract %102[6] : f16 from vector<16xf16>
%373 = vector.insert %372, %371 [0, 6, 14] : f16 into vector<4x16x16xf16>
%374 = vector.extract %103[6] : f16 from vector<16xf16>
%375 = vector.insert %374, %373 [0, 6, 15] : f16 into vector<4x16x16xf16>
%376 = vector.extract %88[7] : f16 from vector<16xf16>
%377 = vector.insert %376, %375 [0, 7, 0] : f16 into vector<4x16x16xf16>
%378 = vector.extract %89[7] : f16 from vector<16xf16>
%379 = vector.insert %378, %377 [0, 7, 1] : f16 into vector<4x16x16xf16>
%380 = vector.extract %90[7] : f16 from vector<16xf16>
%381 = vector.insert %380, %379 [0, 7, 2] : f16 into vector<4x16x16xf16>
%382 = vector.extract %91[7] : f16 from vector<16xf16>
%383 = vector.insert %382, %381 [0, 7, 3] : f16 into vector<4x16x16xf16>
%384 = vector.extract %92[7] : f16 from vector<16xf16>
%385 = vector.insert %384, %383 [0, 7, 4] : f16 into vector<4x16x16xf16>
%386 = vector.extract %93[7] : f16 from vector<16xf16>
%387 = vector.insert %386, %385 [0, 7, 5] : f16 into vector<4x16x16xf16>
%388 = vector.extract %94[7] : f16 from vector<16xf16>
%389 = vector.insert %388, %387 [0, 7, 6] : f16 into vector<4x16x16xf16>
%390 = vector.extract %95[7] : f16 from vector<16xf16>
%391 = vector.insert %390, %389 [0, 7, 7] : f16 into vector<4x16x16xf16>
%392 = vector.extract %96[7] : f16 from vector<16xf16>
%393 = vector.insert %392, %391 [0, 7, 8] : f16 into vector<4x16x16xf16>
%394 = vector.extract %97[7] : f16 from vector<16xf16>
%395 = vector.insert %394, %393 [0, 7, 9] : f16 into vector<4x16x16xf16>
%396 = vector.extract %98[7] : f16 from vector<16xf16>
%397 = vector.insert %396, %395 [0, 7, 10] : f16 into vector<4x16x16xf16>
%398 = vector.extract %99[7] : f16 from vector<16xf16>
%399 = vector.insert %398, %397 [0, 7, 11] : f16 into vector<4x16x16xf16>
%400 = vector.extract %100[7] : f16 from vector<16xf16>
%401 = vector.insert %400, %399 [0, 7, 12] : f16 into vector<4x16x16xf16>
%402 = vector.extract %101[7] : f16 from vector<16xf16>
%403 = vector.insert %402, %401 [0, 7, 13] : f16 into vector<4x16x16xf16>
%404 = vector.extract %102[7] : f16 from vector<16xf16>
%405 = vector.insert %404, %403 [0, 7, 14] : f16 into vector<4x16x16xf16>
%406 = vector.extract %103[7] : f16 from vector<16xf16>
%407 = vector.insert %406, %405 [0, 7, 15] : f16 into vector<4x16x16xf16>
%408 = vector.extract %88[8] : f16 from vector<16xf16>
%409 = vector.insert %408, %407 [0, 8, 0] : f16 into vector<4x16x16xf16>
%410 = vector.extract %89[8] : f16 from vector<16xf16>
%411 = vector.insert %410, %409 [0, 8, 1] : f16 into vector<4x16x16xf16>
%412 = vector.extract %90[8] : f16 from vector<16xf16>
%413 = vector.insert %412, %411 [0, 8, 2] : f16 into vector<4x16x16xf16>
%414 = vector.extract %91[8] : f16 from vector<16xf16>
%415 = vector.insert %414, %413 [0, 8, 3] : f16 into vector<4x16x16xf16>
%416 = vector.extract %92[8] : f16 from vector<16xf16>
%417 = vector.insert %416, %415 [0, 8, 4] : f16 into vector<4x16x16xf16>
%418 = vector.extract %93[8] : f16 from vector<16xf16>
%419 = vector.insert %418, %417 [0, 8, 5] : f16 into vector<4x16x16xf16>
%420 = vector.extract %94[8] : f16 from vector<16xf16>
%421 = vector.insert %420, %419 [0, 8, 6] : f16 into vector<4x16x16xf16>
%422 = vector.extract %95[8] : f16 from vector<16xf16>
%423 = vector.insert %422, %421 [0, 8, 7] : f16 into vector<4x16x16xf16>
%424 = vector.extract %96[8] : f16 from vector<16xf16>
%425 = vector.insert %424, %423 [0, 8, 8] : f16 into vector<4x16x16xf16>
%426 = vector.extract %97[8] : f16 from vector<16xf16>
%427 = vector.insert %426, %425 [0, 8, 9] : f16 into vector<4x16x16xf16>
%428 = vector.extract %98[8] : f16 from vector<16xf16>
%429 = vector.insert %428, %427 [0, 8, 10] : f16 into vector<4x16x16xf16>
%430 = vector.extract %99[8] : f16 from vector<16xf16>
%431 = vector.insert %430, %429 [0, 8, 11] : f16 into vector<4x16x16xf16>
%432 = vector.extract %100[8] : f16 from vector<16xf16>
%433 = vector.insert %432, %431 [0, 8, 12] : f16 into vector<4x16x16xf16>
%434 = vector.extract %101[8] : f16 from vector<16xf16>
%435 = vector.insert %434, %433 [0, 8, 13] : f16 into vector<4x16x16xf16>
%436 = vector.extract %102[8] : f16 from vector<16xf16>
%437 = vector.insert %436, %435 [0, 8, 14] : f16 into vector<4x16x16xf16>
%438 = vector.extract %103[8] : f16 from vector<16xf16>
%439 = vector.insert %438, %437 [0, 8, 15] : f16 into vector<4x16x16xf16>
%440 = vector.extract %88[9] : f16 from vector<16xf16>
%441 = vector.insert %440, %439 [0, 9, 0] : f16 into vector<4x16x16xf16>
%442 = vector.extract %89[9] : f16 from vector<16xf16>
%443 = vector.insert %442, %441 [0, 9, 1] : f16 into vector<4x16x16xf16>
%444 = vector.extract %90[9] : f16 from vector<16xf16>
%445 = vector.insert %444, %443 [0, 9, 2] : f16 into vector<4x16x16xf16>
%446 = vector.extract %91[9] : f16 from vector<16xf16>
%447 = vector.insert %446, %445 [0, 9, 3] : f16 into vector<4x16x16xf16>
%448 = vector.extract %92[9] : f16 from vector<16xf16>
%449 = vector.insert %448, %447 [0, 9, 4] : f16 into vector<4x16x16xf16>
%450 = vector.extract %93[9] : f16 from vector<16xf16>
%451 = vector.insert %450, %449 [0, 9, 5] : f16 into vector<4x16x16xf16>
%452 = vector.extract %94[9] : f16 from vector<16xf16>
%453 = vector.insert %452, %451 [0, 9, 6] : f16 into vector<4x16x16xf16>
%454 = vector.extract %95[9] : f16 from vector<16xf16>
%455 = vector.insert %454, %453 [0, 9, 7] : f16 into vector<4x16x16xf16>
%456 = vector.extract %96[9] : f16 from vector<16xf16>
%457 = vector.insert %456, %455 [0, 9, 8] : f16 into vector<4x16x16xf16>
%458 = vector.extract %97[9] : f16 from vector<16xf16>
%459 = vector.insert %458, %457 [0, 9, 9] : f16 into vector<4x16x16xf16>
%460 = vector.extract %98[9] : f16 from vector<16xf16>
%461 = vector.insert %460, %459 [0, 9, 10] : f16 into vector<4x16x16xf16>
%462 = vector.extract %99[9] : f16 from vector<16xf16>
%463 = vector.insert %462, %461 [0, 9, 11] : f16 into vector<4x16x16xf16>
%464 = vector.extract %100[9] : f16 from vector<16xf16>
%465 = vector.insert %464, %463 [0, 9, 12] : f16 into vector<4x16x16xf16>
%466 = vector.extract %101[9] : f16 from vector<16xf16>
%467 = vector.insert %466, %465 [0, 9, 13] : f16 into vector<4x16x16xf16>
%468 = vector.extract %102[9] : f16 from vector<16xf16>
%469 = vector.insert %468, %467 [0, 9, 14] : f16 into vector<4x16x16xf16>
%470 = vector.extract %103[9] : f16 from vector<16xf16>
%471 = vector.insert %470, %469 [0, 9, 15] : f16 into vector<4x16x16xf16>
%472 = vector.extract %88[10] : f16 from vector<16xf16>
%473 = vector.insert %472, %471 [0, 10, 0] : f16 into vector<4x16x16xf16>
%474 = vector.extract %89[10] : f16 from vector<16xf16>
%475 = vector.insert %474, %473 [0, 10, 1] : f16 into vector<4x16x16xf16>
%476 = vector.extract %90[10] : f16 from vector<16xf16>
%477 = vector.insert %476, %475 [0, 10, 2] : f16 into vector<4x16x16xf16>
%478 = vector.extract %91[10] : f16 from vector<16xf16>
%479 = vector.insert %478, %477 [0, 10, 3] : f16 into vector<4x16x16xf16>
%480 = vector.extract %92[10] : f16 from vector<16xf16>
%481 = vector.insert %480, %479 [0, 10, 4] : f16 into vector<4x16x16xf16>
%482 = vector.extract %93[10] : f16 from vector<16xf16>
%483 = vector.insert %482, %481 [0, 10, 5] : f16 into vector<4x16x16xf16>
%484 = vector.extract %94[10] : f16 from vector<16xf16>
%485 = vector.insert %484, %483 [0, 10, 6] : f16 into vector<4x16x16xf16>
%486 = vector.extract %95[10] : f16 from vector<16xf16>
%487 = vector.insert %486, %485 [0, 10, 7] : f16 into vector<4x16x16xf16>
%488 = vector.extract %96[10] : f16 from vector<16xf16>
%489 = vector.insert %488, %487 [0, 10, 8] : f16 into vector<4x16x16xf16>
%490 = vector.extract %97[10] : f16 from vector<16xf16>
%491 = vector.insert %490, %489 [0, 10, 9] : f16 into vector<4x16x16xf16>
%492 = vector.extract %98[10] : f16 from vector<16xf16>
%493 = vector.insert %492, %491 [0, 10, 10] : f16 into vector<4x16x16xf16>
%494 = vector.extract %99[10] : f16 from vector<16xf16>
%495 = vector.insert %494, %493 [0, 10, 11] : f16 into vector<4x16x16xf16>
%496 = vector.extract %100[10] : f16 from vector<16xf16>
%497 = vector.insert %496, %495 [0, 10, 12] : f16 into vector<4x16x16xf16>
%498 = vector.extract %101[10] : f16 from vector<16xf16>
%499 = vector.insert %498, %497 [0, 10, 13] : f16 into vector<4x16x16xf16>
%500 = vector.extract %102[10] : f16 from vector<16xf16>
%501 = vector.insert %500, %499 [0, 10, 14] : f16 into vector<4x16x16xf16>
%502 = vector.extract %103[10] : f16 from vector<16xf16>
%503 = vector.insert %502, %501 [0, 10, 15] : f16 into vector<4x16x16xf16>
%504 = vector.extract %88[11] : f16 from vector<16xf16>
%505 = vector.insert %504, %503 [0, 11, 0] : f16 into vector<4x16x16xf16>
%506 = vector.extract %89[11] : f16 from vector<16xf16>
%507 = vector.insert %506, %505 [0, 11, 1] : f16 into vector<4x16x16xf16>
%508 = vector.extract %90[11] : f16 from vector<16xf16>
%509 = vector.insert %508, %507 [0, 11, 2] : f16 into vector<4x16x16xf16>
%510 = vector.extract %91[11] : f16 from vector<16xf16>
%511 = vector.insert %510, %509 [0, 11, 3] : f16 into vector<4x16x16xf16>
%512 = vector.extract %92[11] : f16 from vector<16xf16>
%513 = vector.insert %512, %511 [0, 11, 4] : f16 into vector<4x16x16xf16>
%514 = vector.extract %93[11] : f16 from vector<16xf16>
%515 = vector.insert %514, %513 [0, 11, 5] : f16 into vector<4x16x16xf16>
%516 = vector.extract %94[11] : f16 from vector<16xf16>
%517 = vector.insert %516, %515 [0, 11, 6] : f16 into vector<4x16x16xf16>
%518 = vector.extract %95[11] : f16 from vector<16xf16>
%519 = vector.insert %518, %517 [0, 11, 7] : f16 into vector<4x16x16xf16>
%520 = vector.extract %96[11] : f16 from vector<16xf16>
%521 = vector.insert %520, %519 [0, 11, 8] : f16 into vector<4x16x16xf16>
%522 = vector.extract %97[11] : f16 from vector<16xf16>
%523 = vector.insert %522, %521 [0, 11, 9] : f16 into vector<4x16x16xf16>
%524 = vector.extract %98[11] : f16 from vector<16xf16>
%525 = vector.insert %524, %523 [0, 11, 10] : f16 into vector<4x16x16xf16>
%526 = vector.extract %99[11] : f16 from vector<16xf16>
%527 = vector.insert %526, %525 [0, 11, 11] : f16 into vector<4x16x16xf16>
%528 = vector.extract %100[11] : f16 from vector<16xf16>
%529 = vector.insert %528, %527 [0, 11, 12] : f16 into vector<4x16x16xf16>
%530 = vector.extract %101[11] : f16 from vector<16xf16>
%531 = vector.insert %530, %529 [0, 11, 13] : f16 into vector<4x16x16xf16>
%532 = vector.extract %102[11] : f16 from vector<16xf16>
%533 = vector.insert %532, %531 [0, 11, 14] : f16 into vector<4x16x16xf16>
%534 = vector.extract %103[11] : f16 from vector<16xf16>
%535 = vector.insert %534, %533 [0, 11, 15] : f16 into vector<4x16x16xf16>
%536 = vector.extract %88[12] : f16 from vector<16xf16>
%537 = vector.insert %536, %535 [0, 12, 0] : f16 into vector<4x16x16xf16>
%538 = vector.extract %89[12] : f16 from vector<16xf16>
%539 = vector.insert %538, %537 [0, 12, 1] : f16 into vector<4x16x16xf16>
%540 = vector.extract %90[12] : f16 from vector<16xf16>
%541 = vector.insert %540, %539 [0, 12, 2] : f16 into vector<4x16x16xf16>
%542 = vector.extract %91[12] : f16 from vector<16xf16>
%543 = vector.insert %542, %541 [0, 12, 3] : f16 into vector<4x16x16xf16>
%544 = vector.extract %92[12] : f16 from vector<16xf16>
%545 = vector.insert %544, %543 [0, 12, 4] : f16 into vector<4x16x16xf16>
%546 = vector.extract %93[12] : f16 from vector<16xf16>
%547 = vector.insert %546, %545 [0, 12, 5] : f16 into vector<4x16x16xf16>
%548 = vector.extract %94[12] : f16 from vector<16xf16>
%549 = vector.insert %548, %547 [0, 12, 6] : f16 into vector<4x16x16xf16>
%550 = vector.extract %95[12] : f16 from vector<16xf16>
%551 = vector.insert %550, %549 [0, 12, 7] : f16 into vector<4x16x16xf16>
%552 = vector.extract %96[12] : f16 from vector<16xf16>
%553 = vector.insert %552, %551 [0, 12, 8] : f16 into vector<4x16x16xf16>
%554 = vector.extract %97[12] : f16 from vector<16xf16>
%555 = vector.insert %554, %553 [0, 12, 9] : f16 into vector<4x16x16xf16>
%556 = vector.extract %98[12] : f16 from vector<16xf16>
%557 = vector.insert %556, %555 [0, 12, 10] : f16 into vector<4x16x16xf16>
%558 = vector.extract %99[12] : f16 from vector<16xf16>
%559 = vector.insert %558, %557 [0, 12, 11] : f16 into vector<4x16x16xf16>
%560 = vector.extract %100[12] : f16 from vector<16xf16>
%561 = vector.insert %560, %559 [0, 12, 12] : f16 into vector<4x16x16xf16>
%562 = vector.extract %101[12] : f16 from vector<16xf16>
%563 = vector.insert %562, %561 [0, 12, 13] : f16 into vector<4x16x16xf16>
%564 = vector.extract %102[12] : f16 from vector<16xf16>
%565 = vector.insert %564, %563 [0, 12, 14] : f16 into vector<4x16x16xf16>
%566 = vector.extract %103[12] : f16 from vector<16xf16>
%567 = vector.insert %566, %565 [0, 12, 15] : f16 into vector<4x16x16xf16>
%568 = vector.extract %88[13] : f16 from vector<16xf16>
%569 = vector.insert %568, %567 [0, 13, 0] : f16 into vector<4x16x16xf16>
%570 = vector.extract %89[13] : f16 from vector<16xf16>
%571 = vector.insert %570, %569 [0, 13, 1] : f16 into vector<4x16x16xf16>
%572 = vector.extract %90[13] : f16 from vector<16xf16>
%573 = vector.insert %572, %571 [0, 13, 2] : f16 into vector<4x16x16xf16>
%574 = vector.extract %91[13] : f16 from vector<16xf16>
%575 = vector.insert %574, %573 [0, 13, 3] : f16 into vector<4x16x16xf16>
%576 = vector.extract %92[13] : f16 from vector<16xf16>
%577 = vector.insert %576, %575 [0, 13, 4] : f16 into vector<4x16x16xf16>
%578 = vector.extract %93[13] : f16 from vector<16xf16>
%579 = vector.insert %578, %577 [0, 13, 5] : f16 into vector<4x16x16xf16>
%580 = vector.extract %94[13] : f16 from vector<16xf16>
%581 = vector.insert %580, %579 [0, 13, 6] : f16 into vector<4x16x16xf16>
%582 = vector.extract %95[13] : f16 from vector<16xf16>
%583 = vector.insert %582, %581 [0, 13, 7] : f16 into vector<4x16x16xf16>
%584 = vector.extract %96[13] : f16 from vector<16xf16>
%585 = vector.insert %584, %583 [0, 13, 8] : f16 into vector<4x16x16xf16>
%586 = vector.extract %97[13] : f16 from vector<16xf16>
%587 = vector.insert %586, %585 [0, 13, 9] : f16 into vector<4x16x16xf16>
%588 = vector.extract %98[13] : f16 from vector<16xf16>
%589 = vector.insert %588, %587 [0, 13, 10] : f16 into vector<4x16x16xf16>
%590 = vector.extract %99[13] : f16 from vector<16xf16>
%591 = vector.insert %590, %589 [0, 13, 11] : f16 into vector<4x16x16xf16>
%592 = vector.extract %100[13] : f16 from vector<16xf16>
%593 = vector.insert %592, %591 [0, 13, 12] : f16 into vector<4x16x16xf16>
%594 = vector.extract %101[13] : f16 from vector<16xf16>
%595 = vector.insert %594, %593 [0, 13, 13] : f16 into vector<4x16x16xf16>
%596 = vector.extract %102[13] : f16 from vector<16xf16>
%597 = vector.insert %596, %595 [0, 13, 14] : f16 into vector<4x16x16xf16>
%598 = vector.extract %103[13] : f16 from vector<16xf16>
%599 = vector.insert %598, %597 [0, 13, 15] : f16 into vector<4x16x16xf16>
%600 = vector.extract %88[14] : f16 from vector<16xf16>
%601 = vector.insert %600, %599 [0, 14, 0] : f16 into vector<4x16x16xf16>
%602 = vector.extract %89[14] : f16 from vector<16xf16>
%603 = vector.insert %602, %601 [0, 14, 1] : f16 into vector<4x16x16xf16>
%604 = vector.extract %90[14] : f16 from vector<16xf16>
%605 = vector.insert %604, %603 [0, 14, 2] : f16 into vector<4x16x16xf16>
%606 = vector.extract %91[14] : f16 from vector<16xf16>
%607 = vector.insert %606, %605 [0, 14, 3] : f16 into vector<4x16x16xf16>
%608 = vector.extract %92[14] : f16 from vector<16xf16>
%609 = vector.insert %608, %607 [0, 14, 4] : f16 into vector<4x16x16xf16>
%610 = vector.extract %93[14] : f16 from vector<16xf16>
%611 = vector.insert %610, %609 [0, 14, 5] : f16 into vector<4x16x16xf16>
%612 = vector.extract %94[14] : f16 from vector<16xf16>
%613 = vector.insert %612, %611 [0, 14, 6] : f16 into vector<4x16x16xf16>
%614 = vector.extract %95[14] : f16 from vector<16xf16>
%615 = vector.insert %614, %613 [0, 14, 7] : f16 into vector<4x16x16xf16>
%616 = vector.extract %96[14] : f16 from vector<16xf16>
%617 = vector.insert %616, %615 [0, 14, 8] : f16 into vector<4x16x16xf16>
%618 = vector.extract %97[14] : f16 from vector<16xf16>
%619 = vector.insert %618, %617 [0, 14, 9] : f16 into vector<4x16x16xf16>
%620 = vector.extract %98[14] : f16 from vector<16xf16>
%621 = vector.insert %620, %619 [0, 14, 10] : f16 into vector<4x16x16xf16>
%622 = vector.extract %99[14] : f16 from vector<16xf16>
%623 = vector.insert %622, %621 [0, 14, 11] : f16 into vector<4x16x16xf16>
%624 = vector.extract %100[14] : f16 from vector<16xf16>
%625 = vector.insert %624, %623 [0, 14, 12] : f16 into vector<4x16x16xf16>
%626 = vector.extract %101[14] : f16 from vector<16xf16>
%627 = vector.insert %626, %625 [0, 14, 13] : f16 into vector<4x16x16xf16>
%628 = vector.extract %102[14] : f16 from vector<16xf16>
%629 = vector.insert %628, %627 [0, 14, 14] : f16 into vector<4x16x16xf16>
%630 = vector.extract %103[14] : f16 from vector<16xf16>
%631 = vector.insert %630, %629 [0, 14, 15] : f16 into vector<4x16x16xf16>
%632 = vector.extract %88[15] : f16 from vector<16xf16>
%633 = vector.insert %632, %631 [0, 15, 0] : f16 into vector<4x16x16xf16>
%634 = vector.extract %89[15] : f16 from vector<16xf16>
%635 = vector.insert %634, %633 [0, 15, 1] : f16 into vector<4x16x16xf16>
%636 = vector.extract %90[15] : f16 from vector<16xf16>
%637 = vector.insert %636, %635 [0, 15, 2] : f16 into vector<4x16x16xf16>
%638 = vector.extract %91[15] : f16 from vector<16xf16>
%639 = vector.insert %638, %637 [0, 15, 3] : f16 into vector<4x16x16xf16>
%640 = vector.extract %92[15] : f16 from vector<16xf16>
%641 = vector.insert %640, %639 [0, 15, 4] : f16 into vector<4x16x16xf16>
%642 = vector.extract %93[15] : f16 from vector<16xf16>
%643 = vector.insert %642, %641 [0, 15, 5] : f16 into vector<4x16x16xf16>
%644 = vector.extract %94[15] : f16 from vector<16xf16>
%645 = vector.insert %644, %643 [0, 15, 6] : f16 into vector<4x16x16xf16>
%646 = vector.extract %95[15] : f16 from vector<16xf16>
%647 = vector.insert %646, %645 [0, 15, 7] : f16 into vector<4x16x16xf16>
%648 = vector.extract %96[15] : f16 from vector<16xf16>
%649 = vector.insert %648, %647 [0, 15, 8] : f16 into vector<4x16x16xf16>
%650 = vector.extract %97[15] : f16 from vector<16xf16>
%651 = vector.insert %650, %649 [0, 15, 9] : f16 into vector<4x16x16xf16>
%652 = vector.extract %98[15] : f16 from vector<16xf16>
%653 = vector.insert %652, %651 [0, 15, 10] : f16 into vector<4x16x16xf16>
%654 = vector.extract %99[15] : f16 from vector<16xf16>
%655 = vector.insert %654, %653 [0, 15, 11] : f16 into vector<4x16x16xf16>
%656 = vector.extract %100[15] : f16 from vector<16xf16>
%657 = vector.insert %656, %655 [0, 15, 12] : f16 into vector<4x16x16xf16>
%658 = vector.extract %101[15] : f16 from vector<16xf16>
%659 = vector.insert %658, %657 [0, 15, 13] : f16 into vector<4x16x16xf16>
%660 = vector.extract %102[15] : f16 from vector<16xf16>
%661 = vector.insert %660, %659 [0, 15, 14] : f16 into vector<4x16x16xf16>
%662 = vector.extract %103[15] : f16 from vector<16xf16>
%663 = vector.insert %662, %661 [0, 15, 15] : f16 into vector<4x16x16xf16>
%664 = vector.extract %104[0] : f16 from vector<16xf16>
%665 = vector.insert %664, %663 [1, 0, 0] : f16 into vector<4x16x16xf16>
%666 = vector.extract %105[0] : f16 from vector<16xf16>
%667 = vector.insert %666, %665 [1, 0, 1] : f16 into vector<4x16x16xf16>
%668 = vector.extract %106[0] : f16 from vector<16xf16>
%669 = vector.insert %668, %667 [1, 0, 2] : f16 into vector<4x16x16xf16>
%670 = vector.extract %107[0] : f16 from vector<16xf16>
%671 = vector.insert %670, %669 [1, 0, 3] : f16 into vector<4x16x16xf16>
%672 = vector.extract %108[0] : f16 from vector<16xf16>
%673 = vector.insert %672, %671 [1, 0, 4] : f16 into vector<4x16x16xf16>
%674 = vector.extract %109[0] : f16 from vector<16xf16>
%675 = vector.insert %674, %673 [1, 0, 5] : f16 into vector<4x16x16xf16>
%676 = vector.extract %110[0] : f16 from vector<16xf16>
%677 = vector.insert %676, %675 [1, 0, 6] : f16 into vector<4x16x16xf16>
%678 = vector.extract %111[0] : f16 from vector<16xf16>
%679 = vector.insert %678, %677 [1, 0, 7] : f16 into vector<4x16x16xf16>
%680 = vector.extract %112[0] : f16 from vector<16xf16>
%681 = vector.insert %680, %679 [1, 0, 8] : f16 into vector<4x16x16xf16>
%682 = vector.extract %113[0] : f16 from vector<16xf16>
%683 = vector.insert %682, %681 [1, 0, 9] : f16 into vector<4x16x16xf16>
%684 = vector.extract %114[0] : f16 from vector<16xf16>
%685 = vector.insert %684, %683 [1, 0, 10] : f16 into vector<4x16x16xf16>
%686 = vector.extract %115[0] : f16 from vector<16xf16>
%687 = vector.insert %686, %685 [1, 0, 11] : f16 into vector<4x16x16xf16>
%688 = vector.extract %116[0] : f16 from vector<16xf16>
%689 = vector.insert %688, %687 [1, 0, 12] : f16 into vector<4x16x16xf16>
%690 = vector.extract %117[0] : f16 from vector<16xf16>
%691 = vector.insert %690, %689 [1, 0, 13] : f16 into vector<4x16x16xf16>
%692 = vector.extract %118[0] : f16 from vector<16xf16>
%693 = vector.insert %692, %691 [1, 0, 14] : f16 into vector<4x16x16xf16>
%694 = vector.extract %119[0] : f16 from vector<16xf16>
%695 = vector.insert %694, %693 [1, 0, 15] : f16 into vector<4x16x16xf16>
%696 = vector.extract %104[1] : f16 from vector<16xf16>
%697 = vector.insert %696, %695 [1, 1, 0] : f16 into vector<4x16x16xf16>
%698 = vector.extract %105[1] : f16 from vector<16xf16>
%699 = vector.insert %698, %697 [1, 1, 1] : f16 into vector<4x16x16xf16>
%700 = vector.extract %106[1] : f16 from vector<16xf16>
%701 = vector.insert %700, %699 [1, 1, 2] : f16 into vector<4x16x16xf16>
%702 = vector.extract %107[1] : f16 from vector<16xf16>
%703 = vector.insert %702, %701 [1, 1, 3] : f16 into vector<4x16x16xf16>
%704 = vector.extract %108[1] : f16 from vector<16xf16>
%705 = vector.insert %704, %703 [1, 1, 4] : f16 into vector<4x16x16xf16>
%706 = vector.extract %109[1] : f16 from vector<16xf16>
%707 = vector.insert %706, %705 [1, 1, 5] : f16 into vector<4x16x16xf16>
%708 = vector.extract %110[1] : f16 from vector<16xf16>
%709 = vector.insert %708, %707 [1, 1, 6] : f16 into vector<4x16x16xf16>
%710 = vector.extract %111[1] : f16 from vector<16xf16>
%711 = vector.insert %710, %709 [1, 1, 7] : f16 into vector<4x16x16xf16>
%712 = vector.extract %112[1] : f16 from vector<16xf16>
%713 = vector.insert %712, %711 [1, 1, 8] : f16 into vector<4x16x16xf16>
%714 = vector.extract %113[1] : f16 from vector<16xf16>
%715 = vector.insert %714, %713 [1, 1, 9] : f16 into vector<4x16x16xf16>
%716 = vector.extract %114[1] : f16 from vector<16xf16>
%717 = vector.insert %716, %715 [1, 1, 10] : f16 into vector<4x16x16xf16>
%718 = vector.extract %115[1] : f16 from vector<16xf16>
%719 = vector.insert %718, %717 [1, 1, 11] : f16 into vector<4x16x16xf16>
%720 = vector.extract %116[1] : f16 from vector<16xf16>
%721 = vector.insert %720, %719 [1, 1, 12] : f16 into vector<4x16x16xf16>
%722 = vector.extract %117[1] : f16 from vector<16xf16>
%723 = vector.insert %722, %721 [1, 1, 13] : f16 into vector<4x16x16xf16>
%724 = vector.extract %118[1] : f16 from vector<16xf16>
%725 = vector.insert %724, %723 [1, 1, 14] : f16 into vector<4x16x16xf16>
%726 = vector.extract %119[1] : f16 from vector<16xf16>
%727 = vector.insert %726, %725 [1, 1, 15] : f16 into vector<4x16x16xf16>
%728 = vector.extract %104[2] : f16 from vector<16xf16>
%729 = vector.insert %728, %727 [1, 2, 0] : f16 into vector<4x16x16xf16>
%730 = vector.extract %105[2] : f16 from vector<16xf16>
%731 = vector.insert %730, %729 [1, 2, 1] : f16 into vector<4x16x16xf16>
%732 = vector.extract %106[2] : f16 from vector<16xf16>
%733 = vector.insert %732, %731 [1, 2, 2] : f16 into vector<4x16x16xf16>
%734 = vector.extract %107[2] : f16 from vector<16xf16>
%735 = vector.insert %734, %733 [1, 2, 3] : f16 into vector<4x16x16xf16>
%736 = vector.extract %108[2] : f16 from vector<16xf16>
%737 = vector.insert %736, %735 [1, 2, 4] : f16 into vector<4x16x16xf16>
%738 = vector.extract %109[2] : f16 from vector<16xf16>
%739 = vector.insert %738, %737 [1, 2, 5] : f16 into vector<4x16x16xf16>
%740 = vector.extract %110[2] : f16 from vector<16xf16>
%741 = vector.insert %740, %739 [1, 2, 6] : f16 into vector<4x16x16xf16>
%742 = vector.extract %111[2] : f16 from vector<16xf16>
%743 = vector.insert %742, %741 [1, 2, 7] : f16 into vector<4x16x16xf16>
%744 = vector.extract %112[2] : f16 from vector<16xf16>
%745 = vector.insert %744, %743 [1, 2, 8] : f16 into vector<4x16x16xf16>
%746 = vector.extract %113[2] : f16 from vector<16xf16>
%747 = vector.insert %746, %745 [1, 2, 9] : f16 into vector<4x16x16xf16>
%748 = vector.extract %114[2] : f16 from vector<16xf16>
%749 = vector.insert %748, %747 [1, 2, 10] : f16 into vector<4x16x16xf16>
%750 = vector.extract %115[2] : f16 from vector<16xf16>
%751 = vector.insert %750, %749 [1, 2, 11] : f16 into vector<4x16x16xf16>
%752 = vector.extract %116[2] : f16 from vector<16xf16>
%753 = vector.insert %752, %751 [1, 2, 12] : f16 into vector<4x16x16xf16>
%754 = vector.extract %117[2] : f16 from vector<16xf16>
%755 = vector.insert %754, %753 [1, 2, 13] : f16 into vector<4x16x16xf16>
%756 = vector.extract %118[2] : f16 from vector<16xf16>
%757 = vector.insert %756, %755 [1, 2, 14] : f16 into vector<4x16x16xf16>
%758 = vector.extract %119[2] : f16 from vector<16xf16>
%759 = vector.insert %758, %757 [1, 2, 15] : f16 into vector<4x16x16xf16>
%760 = vector.extract %104[3] : f16 from vector<16xf16>
%761 = vector.insert %760, %759 [1, 3, 0] : f16 into vector<4x16x16xf16>
%762 = vector.extract %105[3] : f16 from vector<16xf16>
%763 = vector.insert %762, %761 [1, 3, 1] : f16 into vector<4x16x16xf16>
%764 = vector.extract %106[3] : f16 from vector<16xf16>
%765 = vector.insert %764, %763 [1, 3, 2] : f16 into vector<4x16x16xf16>
%766 = vector.extract %107[3] : f16 from vector<16xf16>
%767 = vector.insert %766, %765 [1, 3, 3] : f16 into vector<4x16x16xf16>
%768 = vector.extract %108[3] : f16 from vector<16xf16>
%769 = vector.insert %768, %767 [1, 3, 4] : f16 into vector<4x16x16xf16>
%770 = vector.extract %109[3] : f16 from vector<16xf16>
%771 = vector.insert %770, %769 [1, 3, 5] : f16 into vector<4x16x16xf16>
%772 = vector.extract %110[3] : f16 from vector<16xf16>
%773 = vector.insert %772, %771 [1, 3, 6] : f16 into vector<4x16x16xf16>
%774 = vector.extract %111[3] : f16 from vector<16xf16>
%775 = vector.insert %774, %773 [1, 3, 7] : f16 into vector<4x16x16xf16>
%776 = vector.extract %112[3] : f16 from vector<16xf16>
%777 = vector.insert %776, %775 [1, 3, 8] : f16 into vector<4x16x16xf16>
%778 = vector.extract %113[3] : f16 from vector<16xf16>
%779 = vector.insert %778, %777 [1, 3, 9] : f16 into vector<4x16x16xf16>
%780 = vector.extract %114[3] : f16 from vector<16xf16>
%781 = vector.insert %780, %779 [1, 3, 10] : f16 into vector<4x16x16xf16>
%782 = vector.extract %115[3] : f16 from vector<16xf16>
%783 = vector.insert %782, %781 [1, 3, 11] : f16 into vector<4x16x16xf16>
%784 = vector.extract %116[3] : f16 from vector<16xf16>
%785 = vector.insert %784, %783 [1, 3, 12] : f16 into vector<4x16x16xf16>
%786 = vector.extract %117[3] : f16 from vector<16xf16>
%787 = vector.insert %786, %785 [1, 3, 13] : f16 into vector<4x16x16xf16>
%788 = vector.extract %118[3] : f16 from vector<16xf16>
%789 = vector.insert %788, %787 [1, 3, 14] : f16 into vector<4x16x16xf16>
%790 = vector.extract %119[3] : f16 from vector<16xf16>
%791 = vector.insert %790, %789 [1, 3, 15] : f16 into vector<4x16x16xf16>
%792 = vector.extract %104[4] : f16 from vector<16xf16>
%793 = vector.insert %792, %791 [1, 4, 0] : f16 into vector<4x16x16xf16>
%794 = vector.extract %105[4] : f16 from vector<16xf16>
%795 = vector.insert %794, %793 [1, 4, 1] : f16 into vector<4x16x16xf16>
%796 = vector.extract %106[4] : f16 from vector<16xf16>
%797 = vector.insert %796, %795 [1, 4, 2] : f16 into vector<4x16x16xf16>
%798 = vector.extract %107[4] : f16 from vector<16xf16>
%799 = vector.insert %798, %797 [1, 4, 3] : f16 into vector<4x16x16xf16>
%800 = vector.extract %108[4] : f16 from vector<16xf16>
%801 = vector.insert %800, %799 [1, 4, 4] : f16 into vector<4x16x16xf16>
%802 = vector.extract %109[4] : f16 from vector<16xf16>
%803 = vector.insert %802, %801 [1, 4, 5] : f16 into vector<4x16x16xf16>
%804 = vector.extract %110[4] : f16 from vector<16xf16>
%805 = vector.insert %804, %803 [1, 4, 6] : f16 into vector<4x16x16xf16>
%806 = vector.extract %111[4] : f16 from vector<16xf16>
%807 = vector.insert %806, %805 [1, 4, 7] : f16 into vector<4x16x16xf16>
%808 = vector.extract %112[4] : f16 from vector<16xf16>
%809 = vector.insert %808, %807 [1, 4, 8] : f16 into vector<4x16x16xf16>
%810 = vector.extract %113[4] : f16 from vector<16xf16>
%811 = vector.insert %810, %809 [1, 4, 9] : f16 into vector<4x16x16xf16>
%812 = vector.extract %114[4] : f16 from vector<16xf16>
%813 = vector.insert %812, %811 [1, 4, 10] : f16 into vector<4x16x16xf16>
%814 = vector.extract %115[4] : f16 from vector<16xf16>
%815 = vector.insert %814, %813 [1, 4, 11] : f16 into vector<4x16x16xf16>
%816 = vector.extract %116[4] : f16 from vector<16xf16>
%817 = vector.insert %816, %815 [1, 4, 12] : f16 into vector<4x16x16xf16>
%818 = vector.extract %117[4] : f16 from vector<16xf16>
%819 = vector.insert %818, %817 [1, 4, 13] : f16 into vector<4x16x16xf16>
%820 = vector.extract %118[4] : f16 from vector<16xf16>
%821 = vector.insert %820, %819 [1, 4, 14] : f16 into vector<4x16x16xf16>
%822 = vector.extract %119[4] : f16 from vector<16xf16>
%823 = vector.insert %822, %821 [1, 4, 15] : f16 into vector<4x16x16xf16>
%824 = vector.extract %104[5] : f16 from vector<16xf16>
%825 = vector.insert %824, %823 [1, 5, 0] : f16 into vector<4x16x16xf16>
%826 = vector.extract %105[5] : f16 from vector<16xf16>
%827 = vector.insert %826, %825 [1, 5, 1] : f16 into vector<4x16x16xf16>
%828 = vector.extract %106[5] : f16 from vector<16xf16>
%829 = vector.insert %828, %827 [1, 5, 2] : f16 into vector<4x16x16xf16>
%830 = vector.extract %107[5] : f16 from vector<16xf16>
%831 = vector.insert %830, %829 [1, 5, 3] : f16 into vector<4x16x16xf16>
%832 = vector.extract %108[5] : f16 from vector<16xf16>
%833 = vector.insert %832, %831 [1, 5, 4] : f16 into vector<4x16x16xf16>
%834 = vector.extract %109[5] : f16 from vector<16xf16>
%835 = vector.insert %834, %833 [1, 5, 5] : f16 into vector<4x16x16xf16>
%836 = vector.extract %110[5] : f16 from vector<16xf16>
%837 = vector.insert %836, %835 [1, 5, 6] : f16 into vector<4x16x16xf16>
%838 = vector.extract %111[5] : f16 from vector<16xf16>
%839 = vector.insert %838, %837 [1, 5, 7] : f16 into vector<4x16x16xf16>
%840 = vector.extract %112[5] : f16 from vector<16xf16>
%841 = vector.insert %840, %839 [1, 5, 8] : f16 into vector<4x16x16xf16>
%842 = vector.extract %113[5] : f16 from vector<16xf16>
%843 = vector.insert %842, %841 [1, 5, 9] : f16 into vector<4x16x16xf16>
%844 = vector.extract %114[5] : f16 from vector<16xf16>
%845 = vector.insert %844, %843 [1, 5, 10] : f16 into vector<4x16x16xf16>
%846 = vector.extract %115[5] : f16 from vector<16xf16>
%847 = vector.insert %846, %845 [1, 5, 11] : f16 into vector<4x16x16xf16>
%848 = vector.extract %116[5] : f16 from vector<16xf16>
%849 = vector.insert %848, %847 [1, 5, 12] : f16 into vector<4x16x16xf16>
%850 = vector.extract %117[5] : f16 from vector<16xf16>
%851 = vector.insert %850, %849 [1, 5, 13] : f16 into vector<4x16x16xf16>
%852 = vector.extract %118[5] : f16 from vector<16xf16>
%853 = vector.insert %852, %851 [1, 5, 14] : f16 into vector<4x16x16xf16>
%854 = vector.extract %119[5] : f16 from vector<16xf16>
%855 = vector.insert %854, %853 [1, 5, 15] : f16 into vector<4x16x16xf16>
%856 = vector.extract %104[6] : f16 from vector<16xf16>
%857 = vector.insert %856, %855 [1, 6, 0] : f16 into vector<4x16x16xf16>
%858 = vector.extract %105[6] : f16 from vector<16xf16>
%859 = vector.insert %858, %857 [1, 6, 1] : f16 into vector<4x16x16xf16>
%860 = vector.extract %106[6] : f16 from vector<16xf16>
%861 = vector.insert %860, %859 [1, 6, 2] : f16 into vector<4x16x16xf16>
%862 = vector.extract %107[6] : f16 from vector<16xf16>
%863 = vector.insert %862, %861 [1, 6, 3] : f16 into vector<4x16x16xf16>
%864 = vector.extract %108[6] : f16 from vector<16xf16>
%865 = vector.insert %864, %863 [1, 6, 4] : f16 into vector<4x16x16xf16>
%866 = vector.extract %109[6] : f16 from vector<16xf16>
%867 = vector.insert %866, %865 [1, 6, 5] : f16 into vector<4x16x16xf16>
%868 = vector.extract %110[6] : f16 from vector<16xf16>
%869 = vector.insert %868, %867 [1, 6, 6] : f16 into vector<4x16x16xf16>
%870 = vector.extract %111[6] : f16 from vector<16xf16>
%871 = vector.insert %870, %869 [1, 6, 7] : f16 into vector<4x16x16xf16>
%872 = vector.extract %112[6] : f16 from vector<16xf16>
%873 = vector.insert %872, %871 [1, 6, 8] : f16 into vector<4x16x16xf16>
%874 = vector.extract %113[6] : f16 from vector<16xf16>
%875 = vector.insert %874, %873 [1, 6, 9] : f16 into vector<4x16x16xf16>
%876 = vector.extract %114[6] : f16 from vector<16xf16>
%877 = vector.insert %876, %875 [1, 6, 10] : f16 into vector<4x16x16xf16>
%878 = vector.extract %115[6] : f16 from vector<16xf16>
%879 = vector.insert %878, %877 [1, 6, 11] : f16 into vector<4x16x16xf16>
%880 = vector.extract %116[6] : f16 from vector<16xf16>
%881 = vector.insert %880, %879 [1, 6, 12] : f16 into vector<4x16x16xf16>
%882 = vector.extract %117[6] : f16 from vector<16xf16>
%883 = vector.insert %882, %881 [1, 6, 13] : f16 into vector<4x16x16xf16>
%884 = vector.extract %118[6] : f16 from vector<16xf16>
%885 = vector.insert %884, %883 [1, 6, 14] : f16 into vector<4x16x16xf16>
%886 = vector.extract %119[6] : f16 from vector<16xf16>
%887 = vector.insert %886, %885 [1, 6, 15] : f16 into vector<4x16x16xf16>
%888 = vector.extract %104[7] : f16 from vector<16xf16>
%889 = vector.insert %888, %887 [1, 7, 0] : f16 into vector<4x16x16xf16>
%890 = vector.extract %105[7] : f16 from vector<16xf16>
%891 = vector.insert %890, %889 [1, 7, 1] : f16 into vector<4x16x16xf16>
%892 = vector.extract %106[7] : f16 from vector<16xf16>
%893 = vector.insert %892, %891 [1, 7, 2] : f16 into vector<4x16x16xf16>
%894 = vector.extract %107[7] : f16 from vector<16xf16>
%895 = vector.insert %894, %893 [1, 7, 3] : f16 into vector<4x16x16xf16>
%896 = vector.extract %108[7] : f16 from vector<16xf16>
%897 = vector.insert %896, %895 [1, 7, 4] : f16 into vector<4x16x16xf16>
%898 = vector.extract %109[7] : f16 from vector<16xf16>
%899 = vector.insert %898, %897 [1, 7, 5] : f16 into vector<4x16x16xf16>
%900 = vector.extract %110[7] : f16 from vector<16xf16>
%901 = vector.insert %900, %899 [1, 7, 6] : f16 into vector<4x16x16xf16>
%902 = vector.extract %111[7] : f16 from vector<16xf16>
%903 = vector.insert %902, %901 [1, 7, 7] : f16 into vector<4x16x16xf16>
%904 = vector.extract %112[7] : f16 from vector<16xf16>
%905 = vector.insert %904, %903 [1, 7, 8] : f16 into vector<4x16x16xf16>
%906 = vector.extract %113[7] : f16 from vector<16xf16>
%907 = vector.insert %906, %905 [1, 7, 9] : f16 into vector<4x16x16xf16>
%908 = vector.extract %114[7] : f16 from vector<16xf16>
%909 = vector.insert %908, %907 [1, 7, 10] : f16 into vector<4x16x16xf16>
%910 = vector.extract %115[7] : f16 from vector<16xf16>
%911 = vector.insert %910, %909 [1, 7, 11] : f16 into vector<4x16x16xf16>
%912 = vector.extract %116[7] : f16 from vector<16xf16>
%913 = vector.insert %912, %911 [1, 7, 12] : f16 into vector<4x16x16xf16>
%914 = vector.extract %117[7] : f16 from vector<16xf16>
%915 = vector.insert %914, %913 [1, 7, 13] : f16 into vector<4x16x16xf16>
%916 = vector.extract %118[7] : f16 from vector<16xf16>
%917 = vector.insert %916, %915 [1, 7, 14] : f16 into vector<4x16x16xf16>
%918 = vector.extract %119[7] : f16 from vector<16xf16>
%919 = vector.insert %918, %917 [1, 7, 15] : f16 into vector<4x16x16xf16>
%920 = vector.extract %104[8] : f16 from vector<16xf16>
%921 = vector.insert %920, %919 [1, 8, 0] : f16 into vector<4x16x16xf16>
%922 = vector.extract %105[8] : f16 from vector<16xf16>
%923 = vector.insert %922, %921 [1, 8, 1] : f16 into vector<4x16x16xf16>
%924 = vector.extract %106[8] : f16 from vector<16xf16>
%925 = vector.insert %924, %923 [1, 8, 2] : f16 into vector<4x16x16xf16>
%926 = vector.extract %107[8] : f16 from vector<16xf16>
%927 = vector.insert %926, %925 [1, 8, 3] : f16 into vector<4x16x16xf16>
%928 = vector.extract %108[8] : f16 from vector<16xf16>
%929 = vector.insert %928, %927 [1, 8, 4] : f16 into vector<4x16x16xf16>
%930 = vector.extract %109[8] : f16 from vector<16xf16>
%931 = vector.insert %930, %929 [1, 8, 5] : f16 into vector<4x16x16xf16>
%932 = vector.extract %110[8] : f16 from vector<16xf16>
%933 = vector.insert %932, %931 [1, 8, 6] : f16 into vector<4x16x16xf16>
%934 = vector.extract %111[8] : f16 from vector<16xf16>
%935 = vector.insert %934, %933 [1, 8, 7] : f16 into vector<4x16x16xf16>
%936 = vector.extract %112[8] : f16 from vector<16xf16>
%937 = vector.insert %936, %935 [1, 8, 8] : f16 into vector<4x16x16xf16>
%938 = vector.extract %113[8] : f16 from vector<16xf16>
%939 = vector.insert %938, %937 [1, 8, 9] : f16 into vector<4x16x16xf16>
%940 = vector.extract %114[8] : f16 from vector<16xf16>
%941 = vector.insert %940, %939 [1, 8, 10] : f16 into vector<4x16x16xf16>
%942 = vector.extract %115[8] : f16 from vector<16xf16>
%943 = vector.insert %942, %941 [1, 8, 11] : f16 into vector<4x16x16xf16>
%944 = vector.extract %116[8] : f16 from vector<16xf16>
%945 = vector.insert %944, %943 [1, 8, 12] : f16 into vector<4x16x16xf16>
%946 = vector.extract %117[8] : f16 from vector<16xf16>
%947 = vector.insert %946, %945 [1, 8, 13] : f16 into vector<4x16x16xf16>
%948 = vector.extract %118[8] : f16 from vector<16xf16>
%949 = vector.insert %948, %947 [1, 8, 14] : f16 into vector<4x16x16xf16>
%950 = vector.extract %119[8] : f16 from vector<16xf16>
%951 = vector.insert %950, %949 [1, 8, 15] : f16 into vector<4x16x16xf16>
%952 = vector.extract %104[9] : f16 from vector<16xf16>
%953 = vector.insert %952, %951 [1, 9, 0] : f16 into vector<4x16x16xf16>
%954 = vector.extract %105[9] : f16 from vector<16xf16>
%955 = vector.insert %954, %953 [1, 9, 1] : f16 into vector<4x16x16xf16>
%956 = vector.extract %106[9] : f16 from vector<16xf16>
%957 = vector.insert %956, %955 [1, 9, 2] : f16 into vector<4x16x16xf16>
%958 = vector.extract %107[9] : f16 from vector<16xf16>
%959 = vector.insert %958, %957 [1, 9, 3] : f16 into vector<4x16x16xf16>
%960 = vector.extract %108[9] : f16 from vector<16xf16>
%961 = vector.insert %960, %959 [1, 9, 4] : f16 into vector<4x16x16xf16>
%962 = vector.extract %109[9] : f16 from vector<16xf16>
%963 = vector.insert %962, %961 [1, 9, 5] : f16 into vector<4x16x16xf16>
%964 = vector.extract %110[9] : f16 from vector<16xf16>
%965 = vector.insert %964, %963 [1, 9, 6] : f16 into vector<4x16x16xf16>
%966 = vector.extract %111[9] : f16 from vector<16xf16>
%967 = vector.insert %966, %965 [1, 9, 7] : f16 into vector<4x16x16xf16>
%968 = vector.extract %112[9] : f16 from vector<16xf16>
%969 = vector.insert %968, %967 [1, 9, 8] : f16 into vector<4x16x16xf16>
%970 = vector.extract %113[9] : f16 from vector<16xf16>
%971 = vector.insert %970, %969 [1, 9, 9] : f16 into vector<4x16x16xf16>
%972 = vector.extract %114[9] : f16 from vector<16xf16>
%973 = vector.insert %972, %971 [1, 9, 10] : f16 into vector<4x16x16xf16>
%974 = vector.extract %115[9] : f16 from vector<16xf16>
%975 = vector.insert %974, %973 [1, 9, 11] : f16 into vector<4x16x16xf16>
%976 = vector.extract %116[9] : f16 from vector<16xf16>
%977 = vector.insert %976, %975 [1, 9, 12] : f16 into vector<4x16x16xf16>
%978 = vector.extract %117[9] : f16 from vector<16xf16>
%979 = vector.insert %978, %977 [1, 9, 13] : f16 into vector<4x16x16xf16>
%980 = vector.extract %118[9] : f16 from vector<16xf16>
%981 = vector.insert %980, %979 [1, 9, 14] : f16 into vector<4x16x16xf16>
%982 = vector.extract %119[9] : f16 from vector<16xf16>
%983 = vector.insert %982, %981 [1, 9, 15] : f16 into vector<4x16x16xf16>
%984 = vector.extract %104[10] : f16 from vector<16xf16>
%985 = vector.insert %984, %983 [1, 10, 0] : f16 into vector<4x16x16xf16>
%986 = vector.extract %105[10] : f16 from vector<16xf16>
%987 = vector.insert %986, %985 [1, 10, 1] : f16 into vector<4x16x16xf16>
%988 = vector.extract %106[10] : f16 from vector<16xf16>
%989 = vector.insert %988, %987 [1, 10, 2] : f16 into vector<4x16x16xf16>
%990 = vector.extract %107[10] : f16 from vector<16xf16>
%991 = vector.insert %990, %989 [1, 10, 3] : f16 into vector<4x16x16xf16>
%992 = vector.extract %108[10] : f16 from vector<16xf16>
%993 = vector.insert %992, %991 [1, 10, 4] : f16 into vector<4x16x16xf16>
%994 = vector.extract %109[10] : f16 from vector<16xf16>
%995 = vector.insert %994, %993 [1, 10, 5] : f16 into vector<4x16x16xf16>
%996 = vector.extract %110[10] : f16 from vector<16xf16>
%997 = vector.insert %996, %995 [1, 10, 6] : f16 into vector<4x16x16xf16>
%998 = vector.extract %111[10] : f16 from vector<16xf16>
%999 = vector.insert %998, %997 [1, 10, 7] : f16 into vector<4x16x16xf16>
%1000 = vector.extract %112[10] : f16 from vector<16xf16>
%1001 = vector.insert %1000, %999 [1, 10, 8] : f16 into vector<4x16x16xf16>
%1002 = vector.extract %113[10] : f16 from vector<16xf16>
%1003 = vector.insert %1002, %1001 [1, 10, 9] : f16 into vector<4x16x16xf16>
%1004 = vector.extract %114[10] : f16 from vector<16xf16>
%1005 = vector.insert %1004, %1003 [1, 10, 10] : f16 into vector<4x16x16xf16>
%1006 = vector.extract %115[10] : f16 from vector<16xf16>
%1007 = vector.insert %1006, %1005 [1, 10, 11] : f16 into vector<4x16x16xf16>
%1008 = vector.extract %116[10] : f16 from vector<16xf16>
%1009 = vector.insert %1008, %1007 [1, 10, 12] : f16 into vector<4x16x16xf16>
%1010 = vector.extract %117[10] : f16 from vector<16xf16>
%1011 = vector.insert %1010, %1009 [1, 10, 13] : f16 into vector<4x16x16xf16>
%1012 = vector.extract %118[10] : f16 from vector<16xf16>
%1013 = vector.insert %1012, %1011 [1, 10, 14] : f16 into vector<4x16x16xf16>
%1014 = vector.extract %119[10] : f16 from vector<16xf16>
%1015 = vector.insert %1014, %1013 [1, 10, 15] : f16 into vector<4x16x16xf16>
%1016 = vector.extract %104[11] : f16 from vector<16xf16>
%1017 = vector.insert %1016, %1015 [1, 11, 0] : f16 into vector<4x16x16xf16>
%1018 = vector.extract %105[11] : f16 from vector<16xf16>
%1019 = vector.insert %1018, %1017 [1, 11, 1] : f16 into vector<4x16x16xf16>
%1020 = vector.extract %106[11] : f16 from vector<16xf16>
%1021 = vector.insert %1020, %1019 [1, 11, 2] : f16 into vector<4x16x16xf16>
%1022 = vector.extract %107[11] : f16 from vector<16xf16>
%1023 = vector.insert %1022, %1021 [1, 11, 3] : f16 into vector<4x16x16xf16>
%1024 = vector.extract %108[11] : f16 from vector<16xf16>
%1025 = vector.insert %1024, %1023 [1, 11, 4] : f16 into vector<4x16x16xf16>
%1026 = vector.extract %109[11] : f16 from vector<16xf16>
%1027 = vector.insert %1026, %1025 [1, 11, 5] : f16 into vector<4x16x16xf16>
%1028 = vector.extract %110[11] : f16 from vector<16xf16>
%1029 = vector.insert %1028, %1027 [1, 11, 6] : f16 into vector<4x16x16xf16>
%1030 = vector.extract %111[11] : f16 from vector<16xf16>
%1031 = vector.insert %1030, %1029 [1, 11, 7] : f16 into vector<4x16x16xf16>
%1032 = vector.extract %112[11] : f16 from vector<16xf16>
%1033 = vector.insert %1032, %1031 [1, 11, 8] : f16 into vector<4x16x16xf16>
%1034 = vector.extract %113[11] : f16 from vector<16xf16>
%1035 = vector.insert %1034, %1033 [1, 11, 9] : f16 into vector<4x16x16xf16>
%1036 = vector.extract %114[11] : f16 from vector<16xf16>
%1037 = vector.insert %1036, %1035 [1, 11, 10] : f16 into vector<4x16x16xf16>
%1038 = vector.extract %115[11] : f16 from vector<16xf16>
%1039 = vector.insert %1038, %1037 [1, 11, 11] : f16 into vector<4x16x16xf16>
%1040 = vector.extract %116[11] : f16 from vector<16xf16>
%1041 = vector.insert %1040, %1039 [1, 11, 12] : f16 into vector<4x16x16xf16>
%1042 = vector.extract %117[11] : f16 from vector<16xf16>
%1043 = vector.insert %1042, %1041 [1, 11, 13] : f16 into vector<4x16x16xf16>
%1044 = vector.extract %118[11] : f16 from vector<16xf16>
%1045 = vector.insert %1044, %1043 [1, 11, 14] : f16 into vector<4x16x16xf16>
%1046 = vector.extract %119[11] : f16 from vector<16xf16>
%1047 = vector.insert %1046, %1045 [1, 11, 15] : f16 into vector<4x16x16xf16>
%1048 = vector.extract %104[12] : f16 from vector<16xf16>
%1049 = vector.insert %1048, %1047 [1, 12, 0] : f16 into vector<4x16x16xf16>
%1050 = vector.extract %105[12] : f16 from vector<16xf16>
%1051 = vector.insert %1050, %1049 [1, 12, 1] : f16 into vector<4x16x16xf16>
%1052 = vector.extract %106[12] : f16 from vector<16xf16>
%1053 = vector.insert %1052, %1051 [1, 12, 2] : f16 into vector<4x16x16xf16>
%1054 = vector.extract %107[12] : f16 from vector<16xf16>
%1055 = vector.insert %1054, %1053 [1, 12, 3] : f16 into vector<4x16x16xf16>
%1056 = vector.extract %108[12] : f16 from vector<16xf16>
%1057 = vector.insert %1056, %1055 [1, 12, 4] : f16 into vector<4x16x16xf16>
%1058 = vector.extract %109[12] : f16 from vector<16xf16>
%1059 = vector.insert %1058, %1057 [1, 12, 5] : f16 into vector<4x16x16xf16>
%1060 = vector.extract %110[12] : f16 from vector<16xf16>
%1061 = vector.insert %1060, %1059 [1, 12, 6] : f16 into vector<4x16x16xf16>
%1062 = vector.extract %111[12] : f16 from vector<16xf16>
%1063 = vector.insert %1062, %1061 [1, 12, 7] : f16 into vector<4x16x16xf16>
%1064 = vector.extract %112[12] : f16 from vector<16xf16>
%1065 = vector.insert %1064, %1063 [1, 12, 8] : f16 into vector<4x16x16xf16>
%1066 = vector.extract %113[12] : f16 from vector<16xf16>
%1067 = vector.insert %1066, %1065 [1, 12, 9] : f16 into vector<4x16x16xf16>
%1068 = vector.extract %114[12] : f16 from vector<16xf16>
%1069 = vector.insert %1068, %1067 [1, 12, 10] : f16 into vector<4x16x16xf16>
%1070 = vector.extract %115[12] : f16 from vector<16xf16>
%1071 = vector.insert %1070, %1069 [1, 12, 11] : f16 into vector<4x16x16xf16>
%1072 = vector.extract %116[12] : f16 from vector<16xf16>
%1073 = vector.insert %1072, %1071 [1, 12, 12] : f16 into vector<4x16x16xf16>
%1074 = vector.extract %117[12] : f16 from vector<16xf16>
%1075 = vector.insert %1074, %1073 [1, 12, 13] : f16 into vector<4x16x16xf16>
%1076 = vector.extract %118[12] : f16 from vector<16xf16>
%1077 = vector.insert %1076, %1075 [1, 12, 14] : f16 into vector<4x16x16xf16>
%1078 = vector.extract %119[12] : f16 from vector<16xf16>
%1079 = vector.insert %1078, %1077 [1, 12, 15] : f16 into vector<4x16x16xf16>
%1080 = vector.extract %104[13] : f16 from vector<16xf16>
%1081 = vector.insert %1080, %1079 [1, 13, 0] : f16 into vector<4x16x16xf16>
%1082 = vector.extract %105[13] : f16 from vector<16xf16>
%1083 = vector.insert %1082, %1081 [1, 13, 1] : f16 into vector<4x16x16xf16>
%1084 = vector.extract %106[13] : f16 from vector<16xf16>
%1085 = vector.insert %1084, %1083 [1, 13, 2] : f16 into vector<4x16x16xf16>
%1086 = vector.extract %107[13] : f16 from vector<16xf16>
%1087 = vector.insert %1086, %1085 [1, 13, 3] : f16 into vector<4x16x16xf16>
%1088 = vector.extract %108[13] : f16 from vector<16xf16>
%1089 = vector.insert %1088, %1087 [1, 13, 4] : f16 into vector<4x16x16xf16>
%1090 = vector.extract %109[13] : f16 from vector<16xf16>
%1091 = vector.insert %1090, %1089 [1, 13, 5] : f16 into vector<4x16x16xf16>
%1092 = vector.extract %110[13] : f16 from vector<16xf16>
%1093 = vector.insert %1092, %1091 [1, 13, 6] : f16 into vector<4x16x16xf16>
%1094 = vector.extract %111[13] : f16 from vector<16xf16>
%1095 = vector.insert %1094, %1093 [1, 13, 7] : f16 into vector<4x16x16xf16>
%1096 = vector.extract %112[13] : f16 from vector<16xf16>
%1097 = vector.insert %1096, %1095 [1, 13, 8] : f16 into vector<4x16x16xf16>
%1098 = vector.extract %113[13] : f16 from vector<16xf16>
%1099 = vector.insert %1098, %1097 [1, 13, 9] : f16 into vector<4x16x16xf16>
%1100 = vector.extract %114[13] : f16 from vector<16xf16>
%1101 = vector.insert %1100, %1099 [1, 13, 10] : f16 into vector<4x16x16xf16>
%1102 = vector.extract %115[13] : f16 from vector<16xf16>
%1103 = vector.insert %1102, %1101 [1, 13, 11] : f16 into vector<4x16x16xf16>
%1104 = vector.extract %116[13] : f16 from vector<16xf16>
%1105 = vector.insert %1104, %1103 [1, 13, 12] : f16 into vector<4x16x16xf16>
%1106 = vector.extract %117[13] : f16 from vector<16xf16>
%1107 = vector.insert %1106, %1105 [1, 13, 13] : f16 into vector<4x16x16xf16>
%1108 = vector.extract %118[13] : f16 from vector<16xf16>
%1109 = vector.insert %1108, %1107 [1, 13, 14] : f16 into vector<4x16x16xf16>
%1110 = vector.extract %119[13] : f16 from vector<16xf16>
%1111 = vector.insert %1110, %1109 [1, 13, 15] : f16 into vector<4x16x16xf16>
%1112 = vector.extract %104[14] : f16 from vector<16xf16>
%1113 = vector.insert %1112, %1111 [1, 14, 0] : f16 into vector<4x16x16xf16>
%1114 = vector.extract %105[14] : f16 from vector<16xf16>
%1115 = vector.insert %1114, %1113 [1, 14, 1] : f16 into vector<4x16x16xf16>
%1116 = vector.extract %106[14] : f16 from vector<16xf16>
%1117 = vector.insert %1116, %1115 [1, 14, 2] : f16 into vector<4x16x16xf16>
%1118 = vector.extract %107[14] : f16 from vector<16xf16>
%1119 = vector.insert %1118, %1117 [1, 14, 3] : f16 into vector<4x16x16xf16>
%1120 = vector.extract %108[14] : f16 from vector<16xf16>
%1121 = vector.insert %1120, %1119 [1, 14, 4] : f16 into vector<4x16x16xf16>
%1122 = vector.extract %109[14] : f16 from vector<16xf16>
%1123 = vector.insert %1122, %1121 [1, 14, 5] : f16 into vector<4x16x16xf16>
%1124 = vector.extract %110[14] : f16 from vector<16xf16>
%1125 = vector.insert %1124, %1123 [1, 14, 6] : f16 into vector<4x16x16xf16>
%1126 = vector.extract %111[14] : f16 from vector<16xf16>
%1127 = vector.insert %1126, %1125 [1, 14, 7] : f16 into vector<4x16x16xf16>
%1128 = vector.extract %112[14] : f16 from vector<16xf16>
%1129 = vector.insert %1128, %1127 [1, 14, 8] : f16 into vector<4x16x16xf16>
%1130 = vector.extract %113[14] : f16 from vector<16xf16>
%1131 = vector.insert %1130, %1129 [1, 14, 9] : f16 into vector<4x16x16xf16>
%1132 = vector.extract %114[14] : f16 from vector<16xf16>
%1133 = vector.insert %1132, %1131 [1, 14, 10] : f16 into vector<4x16x16xf16>
%1134 = vector.extract %115[14] : f16 from vector<16xf16>
%1135 = vector.insert %1134, %1133 [1, 14, 11] : f16 into vector<4x16x16xf16>
%1136 = vector.extract %116[14] : f16 from vector<16xf16>
%1137 = vector.insert %1136, %1135 [1, 14, 12] : f16 into vector<4x16x16xf16>
%1138 = vector.extract %117[14] : f16 from vector<16xf16>
%1139 = vector.insert %1138, %1137 [1, 14, 13] : f16 into vector<4x16x16xf16>
%1140 = vector.extract %118[14] : f16 from vector<16xf16>
%1141 = vector.insert %1140, %1139 [1, 14, 14] : f16 into vector<4x16x16xf16>
%1142 = vector.extract %119[14] : f16 from vector<16xf16>
%1143 = vector.insert %1142, %1141 [1, 14, 15] : f16 into vector<4x16x16xf16>
%1144 = vector.extract %104[15] : f16 from vector<16xf16>
%1145 = vector.insert %1144, %1143 [1, 15, 0] : f16 into vector<4x16x16xf16>
%1146 = vector.extract %105[15] : f16 from vector<16xf16>
%1147 = vector.insert %1146, %1145 [1, 15, 1] : f16 into vector<4x16x16xf16>
%1148 = vector.extract %106[15] : f16 from vector<16xf16>
%1149 = vector.insert %1148, %1147 [1, 15, 2] : f16 into vector<4x16x16xf16>
%1150 = vector.extract %107[15] : f16 from vector<16xf16>
%1151 = vector.insert %1150, %1149 [1, 15, 3] : f16 into vector<4x16x16xf16>
%1152 = vector.extract %108[15] : f16 from vector<16xf16>
%1153 = vector.insert %1152, %1151 [1, 15, 4] : f16 into vector<4x16x16xf16>
%1154 = vector.extract %109[15] : f16 from vector<16xf16>
%1155 = vector.insert %1154, %1153 [1, 15, 5] : f16 into vector<4x16x16xf16>
%1156 = vector.extract %110[15] : f16 from vector<16xf16>
%1157 = vector.insert %1156, %1155 [1, 15, 6] : f16 into vector<4x16x16xf16>
%1158 = vector.extract %111[15] : f16 from vector<16xf16>
%1159 = vector.insert %1158, %1157 [1, 15, 7] : f16 into vector<4x16x16xf16>
%1160 = vector.extract %112[15] : f16 from vector<16xf16>
%1161 = vector.insert %1160, %1159 [1, 15, 8] : f16 into vector<4x16x16xf16>
%1162 = vector.extract %113[15] : f16 from vector<16xf16>
%1163 = vector.insert %1162, %1161 [1, 15, 9] : f16 into vector<4x16x16xf16>
%1164 = vector.extract %114[15] : f16 from vector<16xf16>
%1165 = vector.insert %1164, %1163 [1, 15, 10] : f16 into vector<4x16x16xf16>
%1166 = vector.extract %115[15] : f16 from vector<16xf16>
%1167 = vector.insert %1166, %1165 [1, 15, 11] : f16 into vector<4x16x16xf16>
%1168 = vector.extract %116[15] : f16 from vector<16xf16>
%1169 = vector.insert %1168, %1167 [1, 15, 12] : f16 into vector<4x16x16xf16>
%1170 = vector.extract %117[15] : f16 from vector<16xf16>
%1171 = vector.insert %1170, %1169 [1, 15, 13] : f16 into vector<4x16x16xf16>
%1172 = vector.extract %118[15] : f16 from vector<16xf16>
%1173 = vector.insert %1172, %1171 [1, 15, 14] : f16 into vector<4x16x16xf16>
%1174 = vector.extract %119[15] : f16 from vector<16xf16>
%1175 = vector.insert %1174, %1173 [1, 15, 15] : f16 into vector<4x16x16xf16>
%1176 = vector.extract %120[0] : f16 from vector<16xf16>
%1177 = vector.insert %1176, %1175 [2, 0, 0] : f16 into vector<4x16x16xf16>
%1178 = vector.extract %121[0] : f16 from vector<16xf16>
%1179 = vector.insert %1178, %1177 [2, 0, 1] : f16 into vector<4x16x16xf16>
%1180 = vector.extract %122[0] : f16 from vector<16xf16>
%1181 = vector.insert %1180, %1179 [2, 0, 2] : f16 into vector<4x16x16xf16>
%1182 = vector.extract %123[0] : f16 from vector<16xf16>
%1183 = vector.insert %1182, %1181 [2, 0, 3] : f16 into vector<4x16x16xf16>
%1184 = vector.extract %124[0] : f16 from vector<16xf16>
%1185 = vector.insert %1184, %1183 [2, 0, 4] : f16 into vector<4x16x16xf16>
%1186 = vector.extract %125[0] : f16 from vector<16xf16>
%1187 = vector.insert %1186, %1185 [2, 0, 5] : f16 into vector<4x16x16xf16>
%1188 = vector.extract %126[0] : f16 from vector<16xf16>
%1189 = vector.insert %1188, %1187 [2, 0, 6] : f16 into vector<4x16x16xf16>
%1190 = vector.extract %127[0] : f16 from vector<16xf16>
%1191 = vector.insert %1190, %1189 [2, 0, 7] : f16 into vector<4x16x16xf16>
%1192 = vector.extract %128[0] : f16 from vector<16xf16>
%1193 = vector.insert %1192, %1191 [2, 0, 8] : f16 into vector<4x16x16xf16>
%1194 = vector.extract %129[0] : f16 from vector<16xf16>
%1195 = vector.insert %1194, %1193 [2, 0, 9] : f16 into vector<4x16x16xf16>
%1196 = vector.extract %130[0] : f16 from vector<16xf16>
%1197 = vector.insert %1196, %1195 [2, 0, 10] : f16 into vector<4x16x16xf16>
%1198 = vector.extract %131[0] : f16 from vector<16xf16>
%1199 = vector.insert %1198, %1197 [2, 0, 11] : f16 into vector<4x16x16xf16>
%1200 = vector.extract %132[0] : f16 from vector<16xf16>
%1201 = vector.insert %1200, %1199 [2, 0, 12] : f16 into vector<4x16x16xf16>
%1202 = vector.extract %133[0] : f16 from vector<16xf16>
%1203 = vector.insert %1202, %1201 [2, 0, 13] : f16 into vector<4x16x16xf16>
%1204 = vector.extract %134[0] : f16 from vector<16xf16>
%1205 = vector.insert %1204, %1203 [2, 0, 14] : f16 into vector<4x16x16xf16>
%1206 = vector.extract %135[0] : f16 from vector<16xf16>
%1207 = vector.insert %1206, %1205 [2, 0, 15] : f16 into vector<4x16x16xf16>
%1208 = vector.extract %120[1] : f16 from vector<16xf16>
%1209 = vector.insert %1208, %1207 [2, 1, 0] : f16 into vector<4x16x16xf16>
%1210 = vector.extract %121[1] : f16 from vector<16xf16>
%1211 = vector.insert %1210, %1209 [2, 1, 1] : f16 into vector<4x16x16xf16>
%1212 = vector.extract %122[1] : f16 from vector<16xf16>
%1213 = vector.insert %1212, %1211 [2, 1, 2] : f16 into vector<4x16x16xf16>
%1214 = vector.extract %123[1] : f16 from vector<16xf16>
%1215 = vector.insert %1214, %1213 [2, 1, 3] : f16 into vector<4x16x16xf16>
%1216 = vector.extract %124[1] : f16 from vector<16xf16>
%1217 = vector.insert %1216, %1215 [2, 1, 4] : f16 into vector<4x16x16xf16>
%1218 = vector.extract %125[1] : f16 from vector<16xf16>
%1219 = vector.insert %1218, %1217 [2, 1, 5] : f16 into vector<4x16x16xf16>
%1220 = vector.extract %126[1] : f16 from vector<16xf16>
%1221 = vector.insert %1220, %1219 [2, 1, 6] : f16 into vector<4x16x16xf16>
%1222 = vector.extract %127[1] : f16 from vector<16xf16>
%1223 = vector.insert %1222, %1221 [2, 1, 7] : f16 into vector<4x16x16xf16>
%1224 = vector.extract %128[1] : f16 from vector<16xf16>
%1225 = vector.insert %1224, %1223 [2, 1, 8] : f16 into vector<4x16x16xf16>
%1226 = vector.extract %129[1] : f16 from vector<16xf16>
%1227 = vector.insert %1226, %1225 [2, 1, 9] : f16 into vector<4x16x16xf16>
%1228 = vector.extract %130[1] : f16 from vector<16xf16>
%1229 = vector.insert %1228, %1227 [2, 1, 10] : f16 into vector<4x16x16xf16>
%1230 = vector.extract %131[1] : f16 from vector<16xf16>
%1231 = vector.insert %1230, %1229 [2, 1, 11] : f16 into vector<4x16x16xf16>
%1232 = vector.extract %132[1] : f16 from vector<16xf16>
%1233 = vector.insert %1232, %1231 [2, 1, 12] : f16 into vector<4x16x16xf16>
%1234 = vector.extract %133[1] : f16 from vector<16xf16>
%1235 = vector.insert %1234, %1233 [2, 1, 13] : f16 into vector<4x16x16xf16>
%1236 = vector.extract %134[1] : f16 from vector<16xf16>
%1237 = vector.insert %1236, %1235 [2, 1, 14] : f16 into vector<4x16x16xf16>
%1238 = vector.extract %135[1] : f16 from vector<16xf16>
%1239 = vector.insert %1238, %1237 [2, 1, 15] : f16 into vector<4x16x16xf16>
%1240 = vector.extract %120[2] : f16 from vector<16xf16>
%1241 = vector.insert %1240, %1239 [2, 2, 0] : f16 into vector<4x16x16xf16>
%1242 = vector.extract %121[2] : f16 from vector<16xf16>
%1243 = vector.insert %1242, %1241 [2, 2, 1] : f16 into vector<4x16x16xf16>
%1244 = vector.extract %122[2] : f16 from vector<16xf16>
%1245 = vector.insert %1244, %1243 [2, 2, 2] : f16 into vector<4x16x16xf16>
%1246 = vector.extract %123[2] : f16 from vector<16xf16>
%1247 = vector.insert %1246, %1245 [2, 2, 3] : f16 into vector<4x16x16xf16>
%1248 = vector.extract %124[2] : f16 from vector<16xf16>
%1249 = vector.insert %1248, %1247 [2, 2, 4] : f16 into vector<4x16x16xf16>
%1250 = vector.extract %125[2] : f16 from vector<16xf16>
%1251 = vector.insert %1250, %1249 [2, 2, 5] : f16 into vector<4x16x16xf16>
%1252 = vector.extract %126[2] : f16 from vector<16xf16>
%1253 = vector.insert %1252, %1251 [2, 2, 6] : f16 into vector<4x16x16xf16>
%1254 = vector.extract %127[2] : f16 from vector<16xf16>
%1255 = vector.insert %1254, %1253 [2, 2, 7] : f16 into vector<4x16x16xf16>
%1256 = vector.extract %128[2] : f16 from vector<16xf16>
%1257 = vector.insert %1256, %1255 [2, 2, 8] : f16 into vector<4x16x16xf16>
%1258 = vector.extract %129[2] : f16 from vector<16xf16>
%1259 = vector.insert %1258, %1257 [2, 2, 9] : f16 into vector<4x16x16xf16>
%1260 = vector.extract %130[2] : f16 from vector<16xf16>
%1261 = vector.insert %1260, %1259 [2, 2, 10] : f16 into vector<4x16x16xf16>
%1262 = vector.extract %131[2] : f16 from vector<16xf16>
%1263 = vector.insert %1262, %1261 [2, 2, 11] : f16 into vector<4x16x16xf16>
%1264 = vector.extract %132[2] : f16 from vector<16xf16>
%1265 = vector.insert %1264, %1263 [2, 2, 12] : f16 into vector<4x16x16xf16>
%1266 = vector.extract %133[2] : f16 from vector<16xf16>
%1267 = vector.insert %1266, %1265 [2, 2, 13] : f16 into vector<4x16x16xf16>
%1268 = vector.extract %134[2] : f16 from vector<16xf16>
%1269 = vector.insert %1268, %1267 [2, 2, 14] : f16 into vector<4x16x16xf16>
%1270 = vector.extract %135[2] : f16 from vector<16xf16>
%1271 = vector.insert %1270, %1269 [2, 2, 15] : f16 into vector<4x16x16xf16>
%1272 = vector.extract %120[3] : f16 from vector<16xf16>
%1273 = vector.insert %1272, %1271 [2, 3, 0] : f16 into vector<4x16x16xf16>
%1274 = vector.extract %121[3] : f16 from vector<16xf16>
%1275 = vector.insert %1274, %1273 [2, 3, 1] : f16 into vector<4x16x16xf16>
%1276 = vector.extract %122[3] : f16 from vector<16xf16>
%1277 = vector.insert %1276, %1275 [2, 3, 2] : f16 into vector<4x16x16xf16>
%1278 = vector.extract %123[3] : f16 from vector<16xf16>
%1279 = vector.insert %1278, %1277 [2, 3, 3] : f16 into vector<4x16x16xf16>
%1280 = vector.extract %124[3] : f16 from vector<16xf16>
%1281 = vector.insert %1280, %1279 [2, 3, 4] : f16 into vector<4x16x16xf16>
%1282 = vector.extract %125[3] : f16 from vector<16xf16>
%1283 = vector.insert %1282, %1281 [2, 3, 5] : f16 into vector<4x16x16xf16>
%1284 = vector.extract %126[3] : f16 from vector<16xf16>
%1285 = vector.insert %1284, %1283 [2, 3, 6] : f16 into vector<4x16x16xf16>
%1286 = vector.extract %127[3] : f16 from vector<16xf16>
%1287 = vector.insert %1286, %1285 [2, 3, 7] : f16 into vector<4x16x16xf16>
%1288 = vector.extract %128[3] : f16 from vector<16xf16>
%1289 = vector.insert %1288, %1287 [2, 3, 8] : f16 into vector<4x16x16xf16>
%1290 = vector.extract %129[3] : f16 from vector<16xf16>
%1291 = vector.insert %1290, %1289 [2, 3, 9] : f16 into vector<4x16x16xf16>
%1292 = vector.extract %130[3] : f16 from vector<16xf16>
%1293 = vector.insert %1292, %1291 [2, 3, 10] : f16 into vector<4x16x16xf16>
%1294 = vector.extract %131[3] : f16 from vector<16xf16>
%1295 = vector.insert %1294, %1293 [2, 3, 11] : f16 into vector<4x16x16xf16>
%1296 = vector.extract %132[3] : f16 from vector<16xf16>
%1297 = vector.insert %1296, %1295 [2, 3, 12] : f16 into vector<4x16x16xf16>
%1298 = vector.extract %133[3] : f16 from vector<16xf16>
%1299 = vector.insert %1298, %1297 [2, 3, 13] : f16 into vector<4x16x16xf16>
%1300 = vector.extract %134[3] : f16 from vector<16xf16>
%1301 = vector.insert %1300, %1299 [2, 3, 14] : f16 into vector<4x16x16xf16>
%1302 = vector.extract %135[3] : f16 from vector<16xf16>
%1303 = vector.insert %1302, %1301 [2, 3, 15] : f16 into vector<4x16x16xf16>
%1304 = vector.extract %120[4] : f16 from vector<16xf16>
%1305 = vector.insert %1304, %1303 [2, 4, 0] : f16 into vector<4x16x16xf16>
%1306 = vector.extract %121[4] : f16 from vector<16xf16>
%1307 = vector.insert %1306, %1305 [2, 4, 1] : f16 into vector<4x16x16xf16>
%1308 = vector.extract %122[4] : f16 from vector<16xf16>
%1309 = vector.insert %1308, %1307 [2, 4, 2] : f16 into vector<4x16x16xf16>
%1310 = vector.extract %123[4] : f16 from vector<16xf16>
%1311 = vector.insert %1310, %1309 [2, 4, 3] : f16 into vector<4x16x16xf16>
%1312 = vector.extract %124[4] : f16 from vector<16xf16>
%1313 = vector.insert %1312, %1311 [2, 4, 4] : f16 into vector<4x16x16xf16>
%1314 = vector.extract %125[4] : f16 from vector<16xf16>
%1315 = vector.insert %1314, %1313 [2, 4, 5] : f16 into vector<4x16x16xf16>
%1316 = vector.extract %126[4] : f16 from vector<16xf16>
%1317 = vector.insert %1316, %1315 [2, 4, 6] : f16 into vector<4x16x16xf16>
%1318 = vector.extract %127[4] : f16 from vector<16xf16>
%1319 = vector.insert %1318, %1317 [2, 4, 7] : f16 into vector<4x16x16xf16>
%1320 = vector.extract %128[4] : f16 from vector<16xf16>
%1321 = vector.insert %1320, %1319 [2, 4, 8] : f16 into vector<4x16x16xf16>
%1322 = vector.extract %129[4] : f16 from vector<16xf16>
%1323 = vector.insert %1322, %1321 [2, 4, 9] : f16 into vector<4x16x16xf16>
%1324 = vector.extract %130[4] : f16 from vector<16xf16>
%1325 = vector.insert %1324, %1323 [2, 4, 10] : f16 into vector<4x16x16xf16>
%1326 = vector.extract %131[4] : f16 from vector<16xf16>
%1327 = vector.insert %1326, %1325 [2, 4, 11] : f16 into vector<4x16x16xf16>
%1328 = vector.extract %132[4] : f16 from vector<16xf16>
%1329 = vector.insert %1328, %1327 [2, 4, 12] : f16 into vector<4x16x16xf16>
%1330 = vector.extract %133[4] : f16 from vector<16xf16>
%1331 = vector.insert %1330, %1329 [2, 4, 13] : f16 into vector<4x16x16xf16>
%1332 = vector.extract %134[4] : f16 from vector<16xf16>
%1333 = vector.insert %1332, %1331 [2, 4, 14] : f16 into vector<4x16x16xf16>
%1334 = vector.extract %135[4] : f16 from vector<16xf16>
%1335 = vector.insert %1334, %1333 [2, 4, 15] : f16 into vector<4x16x16xf16>
%1336 = vector.extract %120[5] : f16 from vector<16xf16>
%1337 = vector.insert %1336, %1335 [2, 5, 0] : f16 into vector<4x16x16xf16>
%1338 = vector.extract %121[5] : f16 from vector<16xf16>
%1339 = vector.insert %1338, %1337 [2, 5, 1] : f16 into vector<4x16x16xf16>
%1340 = vector.extract %122[5] : f16 from vector<16xf16>
%1341 = vector.insert %1340, %1339 [2, 5, 2] : f16 into vector<4x16x16xf16>
%1342 = vector.extract %123[5] : f16 from vector<16xf16>
%1343 = vector.insert %1342, %1341 [2, 5, 3] : f16 into vector<4x16x16xf16>
%1344 = vector.extract %124[5] : f16 from vector<16xf16>
%1345 = vector.insert %1344, %1343 [2, 5, 4] : f16 into vector<4x16x16xf16>
%1346 = vector.extract %125[5] : f16 from vector<16xf16>
%1347 = vector.insert %1346, %1345 [2, 5, 5] : f16 into vector<4x16x16xf16>
%1348 = vector.extract %126[5] : f16 from vector<16xf16>
%1349 = vector.insert %1348, %1347 [2, 5, 6] : f16 into vector<4x16x16xf16>
%1350 = vector.extract %127[5] : f16 from vector<16xf16>
%1351 = vector.insert %1350, %1349 [2, 5, 7] : f16 into vector<4x16x16xf16>
%1352 = vector.extract %128[5] : f16 from vector<16xf16>
%1353 = vector.insert %1352, %1351 [2, 5, 8] : f16 into vector<4x16x16xf16>
%1354 = vector.extract %129[5] : f16 from vector<16xf16>
%1355 = vector.insert %1354, %1353 [2, 5, 9] : f16 into vector<4x16x16xf16>
%1356 = vector.extract %130[5] : f16 from vector<16xf16>
%1357 = vector.insert %1356, %1355 [2, 5, 10] : f16 into vector<4x16x16xf16>
%1358 = vector.extract %131[5] : f16 from vector<16xf16>
%1359 = vector.insert %1358, %1357 [2, 5, 11] : f16 into vector<4x16x16xf16>
%1360 = vector.extract %132[5] : f16 from vector<16xf16>
%1361 = vector.insert %1360, %1359 [2, 5, 12] : f16 into vector<4x16x16xf16>
%1362 = vector.extract %133[5] : f16 from vector<16xf16>
%1363 = vector.insert %1362, %1361 [2, 5, 13] : f16 into vector<4x16x16xf16>
%1364 = vector.extract %134[5] : f16 from vector<16xf16>
%1365 = vector.insert %1364, %1363 [2, 5, 14] : f16 into vector<4x16x16xf16>
%1366 = vector.extract %135[5] : f16 from vector<16xf16>
%1367 = vector.insert %1366, %1365 [2, 5, 15] : f16 into vector<4x16x16xf16>
%1368 = vector.extract %120[6] : f16 from vector<16xf16>
%1369 = vector.insert %1368, %1367 [2, 6, 0] : f16 into vector<4x16x16xf16>
%1370 = vector.extract %121[6] : f16 from vector<16xf16>
%1371 = vector.insert %1370, %1369 [2, 6, 1] : f16 into vector<4x16x16xf16>
%1372 = vector.extract %122[6] : f16 from vector<16xf16>
%1373 = vector.insert %1372, %1371 [2, 6, 2] : f16 into vector<4x16x16xf16>
%1374 = vector.extract %123[6] : f16 from vector<16xf16>
%1375 = vector.insert %1374, %1373 [2, 6, 3] : f16 into vector<4x16x16xf16>
%1376 = vector.extract %124[6] : f16 from vector<16xf16>
%1377 = vector.insert %1376, %1375 [2, 6, 4] : f16 into vector<4x16x16xf16>
%1378 = vector.extract %125[6] : f16 from vector<16xf16>
%1379 = vector.insert %1378, %1377 [2, 6, 5] : f16 into vector<4x16x16xf16>
%1380 = vector.extract %126[6] : f16 from vector<16xf16>
%1381 = vector.insert %1380, %1379 [2, 6, 6] : f16 into vector<4x16x16xf16>
%1382 = vector.extract %127[6] : f16 from vector<16xf16>
%1383 = vector.insert %1382, %1381 [2, 6, 7] : f16 into vector<4x16x16xf16>
%1384 = vector.extract %128[6] : f16 from vector<16xf16>
%1385 = vector.insert %1384, %1383 [2, 6, 8] : f16 into vector<4x16x16xf16>
%1386 = vector.extract %129[6] : f16 from vector<16xf16>
%1387 = vector.insert %1386, %1385 [2, 6, 9] : f16 into vector<4x16x16xf16>
%1388 = vector.extract %130[6] : f16 from vector<16xf16>
%1389 = vector.insert %1388, %1387 [2, 6, 10] : f16 into vector<4x16x16xf16>
%1390 = vector.extract %131[6] : f16 from vector<16xf16>
%1391 = vector.insert %1390, %1389 [2, 6, 11] : f16 into vector<4x16x16xf16>
%1392 = vector.extract %132[6] : f16 from vector<16xf16>
%1393 = vector.insert %1392, %1391 [2, 6, 12] : f16 into vector<4x16x16xf16>
%1394 = vector.extract %133[6] : f16 from vector<16xf16>
%1395 = vector.insert %1394, %1393 [2, 6, 13] : f16 into vector<4x16x16xf16>
%1396 = vector.extract %134[6] : f16 from vector<16xf16>
%1397 = vector.insert %1396, %1395 [2, 6, 14] : f16 into vector<4x16x16xf16>
%1398 = vector.extract %135[6] : f16 from vector<16xf16>
%1399 = vector.insert %1398, %1397 [2, 6, 15] : f16 into vector<4x16x16xf16>
%1400 = vector.extract %120[7] : f16 from vector<16xf16>
%1401 = vector.insert %1400, %1399 [2, 7, 0] : f16 into vector<4x16x16xf16>
%1402 = vector.extract %121[7] : f16 from vector<16xf16>
%1403 = vector.insert %1402, %1401 [2, 7, 1] : f16 into vector<4x16x16xf16>
%1404 = vector.extract %122[7] : f16 from vector<16xf16>
%1405 = vector.insert %1404, %1403 [2, 7, 2] : f16 into vector<4x16x16xf16>
%1406 = vector.extract %123[7] : f16 from vector<16xf16>
%1407 = vector.insert %1406, %1405 [2, 7, 3] : f16 into vector<4x16x16xf16>
%1408 = vector.extract %124[7] : f16 from vector<16xf16>
%1409 = vector.insert %1408, %1407 [2, 7, 4] : f16 into vector<4x16x16xf16>
%1410 = vector.extract %125[7] : f16 from vector<16xf16>
%1411 = vector.insert %1410, %1409 [2, 7, 5] : f16 into vector<4x16x16xf16>
%1412 = vector.extract %126[7] : f16 from vector<16xf16>
%1413 = vector.insert %1412, %1411 [2, 7, 6] : f16 into vector<4x16x16xf16>
%1414 = vector.extract %127[7] : f16 from vector<16xf16>
%1415 = vector.insert %1414, %1413 [2, 7, 7] : f16 into vector<4x16x16xf16>
%1416 = vector.extract %128[7] : f16 from vector<16xf16>
%1417 = vector.insert %1416, %1415 [2, 7, 8] : f16 into vector<4x16x16xf16>
%1418 = vector.extract %129[7] : f16 from vector<16xf16>
%1419 = vector.insert %1418, %1417 [2, 7, 9] : f16 into vector<4x16x16xf16>
%1420 = vector.extract %130[7] : f16 from vector<16xf16>
%1421 = vector.insert %1420, %1419 [2, 7, 10] : f16 into vector<4x16x16xf16>
%1422 = vector.extract %131[7] : f16 from vector<16xf16>
%1423 = vector.insert %1422, %1421 [2, 7, 11] : f16 into vector<4x16x16xf16>
%1424 = vector.extract %132[7] : f16 from vector<16xf16>
%1425 = vector.insert %1424, %1423 [2, 7, 12] : f16 into vector<4x16x16xf16>
%1426 = vector.extract %133[7] : f16 from vector<16xf16>
%1427 = vector.insert %1426, %1425 [2, 7, 13] : f16 into vector<4x16x16xf16>
%1428 = vector.extract %134[7] : f16 from vector<16xf16>
%1429 = vector.insert %1428, %1427 [2, 7, 14] : f16 into vector<4x16x16xf16>
%1430 = vector.extract %135[7] : f16 from vector<16xf16>
%1431 = vector.insert %1430, %1429 [2, 7, 15] : f16 into vector<4x16x16xf16>
%1432 = vector.extract %120[8] : f16 from vector<16xf16>
%1433 = vector.insert %1432, %1431 [2, 8, 0] : f16 into vector<4x16x16xf16>
%1434 = vector.extract %121[8] : f16 from vector<16xf16>
%1435 = vector.insert %1434, %1433 [2, 8, 1] : f16 into vector<4x16x16xf16>
%1436 = vector.extract %122[8] : f16 from vector<16xf16>
%1437 = vector.insert %1436, %1435 [2, 8, 2] : f16 into vector<4x16x16xf16>
%1438 = vector.extract %123[8] : f16 from vector<16xf16>
%1439 = vector.insert %1438, %1437 [2, 8, 3] : f16 into vector<4x16x16xf16>
%1440 = vector.extract %124[8] : f16 from vector<16xf16>
%1441 = vector.insert %1440, %1439 [2, 8, 4] : f16 into vector<4x16x16xf16>
%1442 = vector.extract %125[8] : f16 from vector<16xf16>
%1443 = vector.insert %1442, %1441 [2, 8, 5] : f16 into vector<4x16x16xf16>
%1444 = vector.extract %126[8] : f16 from vector<16xf16>
%1445 = vector.insert %1444, %1443 [2, 8, 6] : f16 into vector<4x16x16xf16>
%1446 = vector.extract %127[8] : f16 from vector<16xf16>
%1447 = vector.insert %1446, %1445 [2, 8, 7] : f16 into vector<4x16x16xf16>
%1448 = vector.extract %128[8] : f16 from vector<16xf16>
%1449 = vector.insert %1448, %1447 [2, 8, 8] : f16 into vector<4x16x16xf16>
%1450 = vector.extract %129[8] : f16 from vector<16xf16>
%1451 = vector.insert %1450, %1449 [2, 8, 9] : f16 into vector<4x16x16xf16>
%1452 = vector.extract %130[8] : f16 from vector<16xf16>
%1453 = vector.insert %1452, %1451 [2, 8, 10] : f16 into vector<4x16x16xf16>
%1454 = vector.extract %131[8] : f16 from vector<16xf16>
%1455 = vector.insert %1454, %1453 [2, 8, 11] : f16 into vector<4x16x16xf16>
%1456 = vector.extract %132[8] : f16 from vector<16xf16>
%1457 = vector.insert %1456, %1455 [2, 8, 12] : f16 into vector<4x16x16xf16>
%1458 = vector.extract %133[8] : f16 from vector<16xf16>
%1459 = vector.insert %1458, %1457 [2, 8, 13] : f16 into vector<4x16x16xf16>
%1460 = vector.extract %134[8] : f16 from vector<16xf16>
%1461 = vector.insert %1460, %1459 [2, 8, 14] : f16 into vector<4x16x16xf16>
%1462 = vector.extract %135[8] : f16 from vector<16xf16>
%1463 = vector.insert %1462, %1461 [2, 8, 15] : f16 into vector<4x16x16xf16>
%1464 = vector.extract %120[9] : f16 from vector<16xf16>
%1465 = vector.insert %1464, %1463 [2, 9, 0] : f16 into vector<4x16x16xf16>
%1466 = vector.extract %121[9] : f16 from vector<16xf16>
%1467 = vector.insert %1466, %1465 [2, 9, 1] : f16 into vector<4x16x16xf16>
%1468 = vector.extract %122[9] : f16 from vector<16xf16>
%1469 = vector.insert %1468, %1467 [2, 9, 2] : f16 into vector<4x16x16xf16>
%1470 = vector.extract %123[9] : f16 from vector<16xf16>
%1471 = vector.insert %1470, %1469 [2, 9, 3] : f16 into vector<4x16x16xf16>
%1472 = vector.extract %124[9] : f16 from vector<16xf16>
%1473 = vector.insert %1472, %1471 [2, 9, 4] : f16 into vector<4x16x16xf16>
%1474 = vector.extract %125[9] : f16 from vector<16xf16>
%1475 = vector.insert %1474, %1473 [2, 9, 5] : f16 into vector<4x16x16xf16>
%1476 = vector.extract %126[9] : f16 from vector<16xf16>
%1477 = vector.insert %1476, %1475 [2, 9, 6] : f16 into vector<4x16x16xf16>
%1478 = vector.extract %127[9] : f16 from vector<16xf16>
%1479 = vector.insert %1478, %1477 [2, 9, 7] : f16 into vector<4x16x16xf16>
%1480 = vector.extract %128[9] : f16 from vector<16xf16>
%1481 = vector.insert %1480, %1479 [2, 9, 8] : f16 into vector<4x16x16xf16>
%1482 = vector.extract %129[9] : f16 from vector<16xf16>
%1483 = vector.insert %1482, %1481 [2, 9, 9] : f16 into vector<4x16x16xf16>
%1484 = vector.extract %130[9] : f16 from vector<16xf16>
%1485 = vector.insert %1484, %1483 [2, 9, 10] : f16 into vector<4x16x16xf16>
%1486 = vector.extract %131[9] : f16 from vector<16xf16>
%1487 = vector.insert %1486, %1485 [2, 9, 11] : f16 into vector<4x16x16xf16>
%1488 = vector.extract %132[9] : f16 from vector<16xf16>
%1489 = vector.insert %1488, %1487 [2, 9, 12] : f16 into vector<4x16x16xf16>
%1490 = vector.extract %133[9] : f16 from vector<16xf16>
%1491 = vector.insert %1490, %1489 [2, 9, 13] : f16 into vector<4x16x16xf16>
%1492 = vector.extract %134[9] : f16 from vector<16xf16>
%1493 = vector.insert %1492, %1491 [2, 9, 14] : f16 into vector<4x16x16xf16>
%1494 = vector.extract %135[9] : f16 from vector<16xf16>
%1495 = vector.insert %1494, %1493 [2, 9, 15] : f16 into vector<4x16x16xf16>
%1496 = vector.extract %120[10] : f16 from vector<16xf16>
%1497 = vector.insert %1496, %1495 [2, 10, 0] : f16 into vector<4x16x16xf16>
%1498 = vector.extract %121[10] : f16 from vector<16xf16>
%1499 = vector.insert %1498, %1497 [2, 10, 1] : f16 into vector<4x16x16xf16>
%1500 = vector.extract %122[10] : f16 from vector<16xf16>
%1501 = vector.insert %1500, %1499 [2, 10, 2] : f16 into vector<4x16x16xf16>
%1502 = vector.extract %123[10] : f16 from vector<16xf16>
%1503 = vector.insert %1502, %1501 [2, 10, 3] : f16 into vector<4x16x16xf16>
%1504 = vector.extract %124[10] : f16 from vector<16xf16>
%1505 = vector.insert %1504, %1503 [2, 10, 4] : f16 into vector<4x16x16xf16>
%1506 = vector.extract %125[10] : f16 from vector<16xf16>
%1507 = vector.insert %1506, %1505 [2, 10, 5] : f16 into vector<4x16x16xf16>
%1508 = vector.extract %126[10] : f16 from vector<16xf16>
%1509 = vector.insert %1508, %1507 [2, 10, 6] : f16 into vector<4x16x16xf16>
%1510 = vector.extract %127[10] : f16 from vector<16xf16>
%1511 = vector.insert %1510, %1509 [2, 10, 7] : f16 into vector<4x16x16xf16>
%1512 = vector.extract %128[10] : f16 from vector<16xf16>
%1513 = vector.insert %1512, %1511 [2, 10, 8] : f16 into vector<4x16x16xf16>
%1514 = vector.extract %129[10] : f16 from vector<16xf16>
%1515 = vector.insert %1514, %1513 [2, 10, 9] : f16 into vector<4x16x16xf16>
%1516 = vector.extract %130[10] : f16 from vector<16xf16>
%1517 = vector.insert %1516, %1515 [2, 10, 10] : f16 into vector<4x16x16xf16>
%1518 = vector.extract %131[10] : f16 from vector<16xf16>
%1519 = vector.insert %1518, %1517 [2, 10, 11] : f16 into vector<4x16x16xf16>
%1520 = vector.extract %132[10] : f16 from vector<16xf16>
%1521 = vector.insert %1520, %1519 [2, 10, 12] : f16 into vector<4x16x16xf16>
%1522 = vector.extract %133[10] : f16 from vector<16xf16>
%1523 = vector.insert %1522, %1521 [2, 10, 13] : f16 into vector<4x16x16xf16>
%1524 = vector.extract %134[10] : f16 from vector<16xf16>
%1525 = vector.insert %1524, %1523 [2, 10, 14] : f16 into vector<4x16x16xf16>
%1526 = vector.extract %135[10] : f16 from vector<16xf16>
%1527 = vector.insert %1526, %1525 [2, 10, 15] : f16 into vector<4x16x16xf16>
%1528 = vector.extract %120[11] : f16 from vector<16xf16>
%1529 = vector.insert %1528, %1527 [2, 11, 0] : f16 into vector<4x16x16xf16>
%1530 = vector.extract %121[11] : f16 from vector<16xf16>
%1531 = vector.insert %1530, %1529 [2, 11, 1] : f16 into vector<4x16x16xf16>
%1532 = vector.extract %122[11] : f16 from vector<16xf16>
%1533 = vector.insert %1532, %1531 [2, 11, 2] : f16 into vector<4x16x16xf16>
%1534 = vector.extract %123[11] : f16 from vector<16xf16>
%1535 = vector.insert %1534, %1533 [2, 11, 3] : f16 into vector<4x16x16xf16>
%1536 = vector.extract %124[11] : f16 from vector<16xf16>
%1537 = vector.insert %1536, %1535 [2, 11, 4] : f16 into vector<4x16x16xf16>
%1538 = vector.extract %125[11] : f16 from vector<16xf16>
%1539 = vector.insert %1538, %1537 [2, 11, 5] : f16 into vector<4x16x16xf16>
%1540 = vector.extract %126[11] : f16 from vector<16xf16>
%1541 = vector.insert %1540, %1539 [2, 11, 6] : f16 into vector<4x16x16xf16>
%1542 = vector.extract %127[11] : f16 from vector<16xf16>
%1543 = vector.insert %1542, %1541 [2, 11, 7] : f16 into vector<4x16x16xf16>
%1544 = vector.extract %128[11] : f16 from vector<16xf16>
%1545 = vector.insert %1544, %1543 [2, 11, 8] : f16 into vector<4x16x16xf16>
%1546 = vector.extract %129[11] : f16 from vector<16xf16>
%1547 = vector.insert %1546, %1545 [2, 11, 9] : f16 into vector<4x16x16xf16>
%1548 = vector.extract %130[11] : f16 from vector<16xf16>
%1549 = vector.insert %1548, %1547 [2, 11, 10] : f16 into vector<4x16x16xf16>
%1550 = vector.extract %131[11] : f16 from vector<16xf16>
%1551 = vector.insert %1550, %1549 [2, 11, 11] : f16 into vector<4x16x16xf16>
%1552 = vector.extract %132[11] : f16 from vector<16xf16>
%1553 = vector.insert %1552, %1551 [2, 11, 12] : f16 into vector<4x16x16xf16>
%1554 = vector.extract %133[11] : f16 from vector<16xf16>
%1555 = vector.insert %1554, %1553 [2, 11, 13] : f16 into vector<4x16x16xf16>
%1556 = vector.extract %134[11] : f16 from vector<16xf16>
%1557 = vector.insert %1556, %1555 [2, 11, 14] : f16 into vector<4x16x16xf16>
%1558 = vector.extract %135[11] : f16 from vector<16xf16>
%1559 = vector.insert %1558, %1557 [2, 11, 15] : f16 into vector<4x16x16xf16>
%1560 = vector.extract %120[12] : f16 from vector<16xf16>
%1561 = vector.insert %1560, %1559 [2, 12, 0] : f16 into vector<4x16x16xf16>
%1562 = vector.extract %121[12] : f16 from vector<16xf16>
%1563 = vector.insert %1562, %1561 [2, 12, 1] : f16 into vector<4x16x16xf16>
%1564 = vector.extract %122[12] : f16 from vector<16xf16>
%1565 = vector.insert %1564, %1563 [2, 12, 2] : f16 into vector<4x16x16xf16>
%1566 = vector.extract %123[12] : f16 from vector<16xf16>
%1567 = vector.insert %1566, %1565 [2, 12, 3] : f16 into vector<4x16x16xf16>
%1568 = vector.extract %124[12] : f16 from vector<16xf16>
%1569 = vector.insert %1568, %1567 [2, 12, 4] : f16 into vector<4x16x16xf16>
%1570 = vector.extract %125[12] : f16 from vector<16xf16>
%1571 = vector.insert %1570, %1569 [2, 12, 5] : f16 into vector<4x16x16xf16>
%1572 = vector.extract %126[12] : f16 from vector<16xf16>
%1573 = vector.insert %1572, %1571 [2, 12, 6] : f16 into vector<4x16x16xf16>
%1574 = vector.extract %127[12] : f16 from vector<16xf16>
%1575 = vector.insert %1574, %1573 [2, 12, 7] : f16 into vector<4x16x16xf16>
%1576 = vector.extract %128[12] : f16 from vector<16xf16>
%1577 = vector.insert %1576, %1575 [2, 12, 8] : f16 into vector<4x16x16xf16>
%1578 = vector.extract %129[12] : f16 from vector<16xf16>
%1579 = vector.insert %1578, %1577 [2, 12, 9] : f16 into vector<4x16x16xf16>
%1580 = vector.extract %130[12] : f16 from vector<16xf16>
%1581 = vector.insert %1580, %1579 [2, 12, 10] : f16 into vector<4x16x16xf16>
%1582 = vector.extract %131[12] : f16 from vector<16xf16>
%1583 = vector.insert %1582, %1581 [2, 12, 11] : f16 into vector<4x16x16xf16>
%1584 = vector.extract %132[12] : f16 from vector<16xf16>
%1585 = vector.insert %1584, %1583 [2, 12, 12] : f16 into vector<4x16x16xf16>
%1586 = vector.extract %133[12] : f16 from vector<16xf16>
%1587 = vector.insert %1586, %1585 [2, 12, 13] : f16 into vector<4x16x16xf16>
%1588 = vector.extract %134[12] : f16 from vector<16xf16>
%1589 = vector.insert %1588, %1587 [2, 12, 14] : f16 into vector<4x16x16xf16>
%1590 = vector.extract %135[12] : f16 from vector<16xf16>
%1591 = vector.insert %1590, %1589 [2, 12, 15] : f16 into vector<4x16x16xf16>
%1592 = vector.extract %120[13] : f16 from vector<16xf16>
%1593 = vector.insert %1592, %1591 [2, 13, 0] : f16 into vector<4x16x16xf16>
%1594 = vector.extract %121[13] : f16 from vector<16xf16>
%1595 = vector.insert %1594, %1593 [2, 13, 1] : f16 into vector<4x16x16xf16>
%1596 = vector.extract %122[13] : f16 from vector<16xf16>
%1597 = vector.insert %1596, %1595 [2, 13, 2] : f16 into vector<4x16x16xf16>
%1598 = vector.extract %123[13] : f16 from vector<16xf16>
%1599 = vector.insert %1598, %1597 [2, 13, 3] : f16 into vector<4x16x16xf16>
%1600 = vector.extract %124[13] : f16 from vector<16xf16>
%1601 = vector.insert %1600, %1599 [2, 13, 4] : f16 into vector<4x16x16xf16>
%1602 = vector.extract %125[13] : f16 from vector<16xf16>
%1603 = vector.insert %1602, %1601 [2, 13, 5] : f16 into vector<4x16x16xf16>
%1604 = vector.extract %126[13] : f16 from vector<16xf16>
%1605 = vector.insert %1604, %1603 [2, 13, 6] : f16 into vector<4x16x16xf16>
%1606 = vector.extract %127[13] : f16 from vector<16xf16>
%1607 = vector.insert %1606, %1605 [2, 13, 7] : f16 into vector<4x16x16xf16>
%1608 = vector.extract %128[13] : f16 from vector<16xf16>
%1609 = vector.insert %1608, %1607 [2, 13, 8] : f16 into vector<4x16x16xf16>
%1610 = vector.extract %129[13] : f16 from vector<16xf16>
%1611 = vector.insert %1610, %1609 [2, 13, 9] : f16 into vector<4x16x16xf16>
%1612 = vector.extract %130[13] : f16 from vector<16xf16>
%1613 = vector.insert %1612, %1611 [2, 13, 10] : f16 into vector<4x16x16xf16>
%1614 = vector.extract %131[13] : f16 from vector<16xf16>
%1615 = vector.insert %1614, %1613 [2, 13, 11] : f16 into vector<4x16x16xf16>
%1616 = vector.extract %132[13] : f16 from vector<16xf16>
%1617 = vector.insert %1616, %1615 [2, 13, 12] : f16 into vector<4x16x16xf16>
%1618 = vector.extract %133[13] : f16 from vector<16xf16>
%1619 = vector.insert %1618, %1617 [2, 13, 13] : f16 into vector<4x16x16xf16>
%1620 = vector.extract %134[13] : f16 from vector<16xf16>
%1621 = vector.insert %1620, %1619 [2, 13, 14] : f16 into vector<4x16x16xf16>
%1622 = vector.extract %135[13] : f16 from vector<16xf16>
%1623 = vector.insert %1622, %1621 [2, 13, 15] : f16 into vector<4x16x16xf16>
%1624 = vector.extract %120[14]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment