Created
February 9, 2023 05:02
-
-
Save pashu123/b3449d68be413ecc828a5892ee1ec598 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/prashant/stable.mlir:793:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:793:11: note: see current operation: %20 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) { | |
^ | |
/home/prashant/stable.mlir:793:11: note: see existing live user here: %28 = "spirv.UConvert"(%20) : (i32) -> i64 | |
/home/prashant/stable.mlir:804:27: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:804:27: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 2 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 4 : index} : () -> index | |
%2 = "arith.constant"() {value = 32 : index} : () -> index | |
%3 = "arith.constant"() {value = 160 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 640 : index} : () -> index | |
%6 = "arith.constant"() {value = 320 : index} : () -> index | |
%7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%9 = "arith.index_castui"(%7) : (i32) -> index | |
%10 = "arith.index_castui"(%8) : (i32) -> index | |
%11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%19 = "arith.muli"(%16, %3) : (index, index) -> index | |
%20 = "arith.muli"(%18, %3) : (index, index) -> index | |
%21 = "arith.addi"(%19, %20) : (index, index) -> index | |
%22 = "arith.muli"(%15, %2) : (index, index) -> index | |
%23 = "arith.addi"(%21, %22) : (index, index) -> index | |
%24 = "arith.addi"(%23, %17) : (index, index) -> index | |
%25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%26 = "arith.subi"(%0, %9) : (index, index) -> index | |
%27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
%28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
%29 = "arith.subi"(%0, %28) : (index, index) -> index | |
%30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
%31 = "arith.addi"(%24, %30) : (index, index) -> index | |
%32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%33 = "arith.muli"(%16, %6) : (index, index) -> index | |
%34 = "arith.muli"(%18, %6) : (index, index) -> index | |
%35 = "arith.addi"(%33, %34) : (index, index) -> index | |
%36 = "arith.addi"(%35, %22) : (index, index) -> index | |
%37 = "arith.addi"(%36, %17) : (index, index) -> index | |
%38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%39 = "arith.subi"(%0, %10) : (index, index) -> index | |
%40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
%41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
%42 = "arith.subi"(%0, %41) : (index, index) -> index | |
%43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
%44 = "arith.addi"(%37, %43) : (index, index) -> index | |
"memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
^ | |
/home/prashant/stable.mlir:804:27: error: failed to serialize executables | |
%inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:804:27: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 2 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 4 : index} : () -> index | |
%2 = "arith.constant"() {value = 32 : index} : () -> index | |
%3 = "arith.constant"() {value = 160 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 640 : index} : () -> index | |
%6 = "arith.constant"() {value = 320 : index} : () -> index | |
%7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%9 = "arith.index_castui"(%7) : (i32) -> index | |
%10 = "arith.index_castui"(%8) : (i32) -> index | |
%11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%19 = "arith.muli"(%16, %3) : (index, index) -> index | |
%20 = "arith.muli"(%18, %3) : (index, index) -> index | |
%21 = "arith.addi"(%19, %20) : (index, index) -> index | |
%22 = "arith.muli"(%15, %2) : (index, index) -> index | |
%23 = "arith.addi"(%21, %22) : (index, index) -> index | |
%24 = "arith.addi"(%23, %17) : (index, index) -> index | |
%25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%26 = "arith.subi"(%0, %9) : (index, index) -> index | |
%27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
%28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
%29 = "arith.subi"(%0, %28) : (index, index) -> index | |
%30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
%31 = "arith.addi"(%24, %30) : (index, index) -> index | |
%32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%33 = "arith.muli"(%16, %6) : (index, index) -> index | |
%34 = "arith.muli"(%18, %6) : (index, index) -> index | |
%35 = "arith.addi"(%33, %34) : (index, index) -> index | |
%36 = "arith.addi"(%35, %22) : (index, index) -> index | |
%37 = "arith.addi"(%36, %17) : (index, index) -> index | |
%38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%39 = "arith.subi"(%0, %10) : (index, index) -> index | |
%40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
%41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
%42 = "arith.subi"(%0, %41) : (index, index) -> index | |
%43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
%44 = "arith.addi"(%37, %43) : (index, index) -> index | |
"memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_3", sym_visibility = "private"} : () -> () | |
%inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
^ | |
/home/prashant/stable.mlir:866:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:866:11: note: see current operation: %51 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:866:11: note: see existing live user here: %59 = "spirv.UConvert"(%51) : (i32) -> i64 | |
/home/prashant/stable.mlir:866:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:866:11: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 64 : index} : () -> index | |
%1 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %1) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_10_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 16 : index} : () -> index | |
%1 = "arith.constant"() {value = 23040 : index} : () -> index | |
%2 = "arith.constant"() {value = -1 : index} : () -> index | |
%3 = "arith.constant"() {value = 4 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 64 : index} : () -> index | |
%6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
%10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
%11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
%12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
%13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
%14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
%15 = "arith.constant"() {value = 32 : index} : () -> index | |
%16 = "arith.constant"() {value = 15 : index} : () -> index | |
%17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
%18 = "arith.constant"() {value = 2048 : index} : () -> index | |
%19 = "arith.constant"() {value = 92160 : index} : () -> index | |
%20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%23 = "arith.index_castui"(%21) : (i32) -> index | |
%24 = "arith.index_castui"(%22) : (i32) -> index | |
%25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%31 = "scf.for"(%4, %19, %18, %8) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>): | |
%70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
%72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
%73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
%74 = "arith.subi"(%2, %73) : (index, index) -> index | |
%75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
%76 = "arith.muli"(%30, %1) : (index, index) -> index | |
%77 = "arith.addi"(%20, %76) : (index, index) -> index | |
%78 = "arith.addi"(%75, %77) : (index, index) -> index | |
%79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%80 = "arith.subi"(%2, %23) : (index, index) -> index | |
%81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
%82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
%83 = "arith.subi"(%2, %82) : (index, index) -> index | |
%84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
%85 = "arith.addi"(%78, %84) : (index, index) -> index | |
%86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%87 = "arith.addf"(%86, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"scf.yield"(%87) : (vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
%32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%33 = "arith.subi"(%2, %24) : (index, index) -> index | |
%34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
%35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
%36 = "arith.subi"(%2, %35) : (index, index) -> index | |
%37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
%38 = "arith.addi"(%30, %37) : (index, index) -> index | |
%39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
%42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
%43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
%54 = "arith.divui"(%20, %15) : (index, index) -> index | |
%55 = "arith.remui"(%20, %15) : (index, index) -> index | |
%56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%56) ({ | |
"memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"gpu.barrier"() : () -> () | |
%57 = "arith.minui"(%55, %16) : (index, index) -> index | |
%58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
%59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
%68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%69) ({ | |
"memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_10_generic_64x92160"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:866:11: error: failed to serialize executables | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:866:11: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 64 : index} : () -> index | |
%1 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %1) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_10_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 16 : index} : () -> index | |
%1 = "arith.constant"() {value = 23040 : index} : () -> index | |
%2 = "arith.constant"() {value = -1 : index} : () -> index | |
%3 = "arith.constant"() {value = 4 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 64 : index} : () -> index | |
%6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
%10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
%11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
%12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
%13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
%14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
%15 = "arith.constant"() {value = 32 : index} : () -> index | |
%16 = "arith.constant"() {value = 15 : index} : () -> index | |
%17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
%18 = "arith.constant"() {value = 2048 : index} : () -> index | |
%19 = "arith.constant"() {value = 92160 : index} : () -> index | |
%20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%23 = "arith.index_castui"(%21) : (i32) -> index | |
%24 = "arith.index_castui"(%22) : (i32) -> index | |
%25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%31 = "scf.for"(%4, %19, %18, %8) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>): | |
%70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
%72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
%73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
%74 = "arith.subi"(%2, %73) : (index, index) -> index | |
%75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
%76 = "arith.muli"(%30, %1) : (index, index) -> index | |
%77 = "arith.addi"(%20, %76) : (index, index) -> index | |
%78 = "arith.addi"(%75, %77) : (index, index) -> index | |
%79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%80 = "arith.subi"(%2, %23) : (index, index) -> index | |
%81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
%82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
%83 = "arith.subi"(%2, %82) : (index, index) -> index | |
%84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
%85 = "arith.addi"(%78, %84) : (index, index) -> index | |
%86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%87 = "arith.addf"(%86, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"scf.yield"(%87) : (vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
%32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%33 = "arith.subi"(%2, %24) : (index, index) -> index | |
%34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
%35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
%36 = "arith.subi"(%2, %35) : (index, index) -> index | |
%37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
%38 = "arith.addi"(%30, %37) : (index, index) -> index | |
%39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
%42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
%43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
%54 = "arith.divui"(%20, %15) : (index, index) -> index | |
%55 = "arith.remui"(%20, %15) : (index, index) -> index | |
%56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%56) ({ | |
"memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"gpu.barrier"() : () -> () | |
%57 = "arith.minui"(%55, %16) : (index, index) -> index | |
%58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
%59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
%68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%69) ({ | |
"memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_10_generic_64x92160"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_10", sym_visibility = "private"} : () -> () | |
%32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:876:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see current operation: %30 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see existing live user here: %45 = "spirv.UConvert"(%30) : (i32) -> i64 | |
/home/prashant/stable.mlir:876:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 72 : index} : () -> index | |
%1 = "arith.constant"() {value = 10 : index} : () -> index | |
%2 = "arith.constant"() {value = 64 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_11_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 4 : index} : () -> index | |
%1 = "arith.constant"() {value = -1 : index} : () -> index | |
%2 = "arith.constant"() {value = 16 : index} : () -> index | |
%3 = "arith.constant"() {value = 2304 : index} : () -> index | |
%4 = "arith.constant"() {value = 23040 : index} : () -> index | |
%5 = "arith.constant"() {value = 32 : index} : () -> index | |
%6 = "arith.constant"() {value = 0 : index} : () -> index | |
%7 = "arith.constant"() {value = 64 : index} : () -> index | |
%8 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%9 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%10 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
%11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%14 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
%15 = "arith.index_castui"(%12) : (i32) -> index | |
%16 = "arith.index_castui"(%13) : (i32) -> index | |
%17 = "arith.index_castui"(%14) : (i32) -> index | |
%18 = "hal.interface.binding.subspan"(%15, %8) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%16, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.binding.subspan"(%6, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%22 = "hal.interface.binding.subspan"(%17, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%23 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%24 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%25 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%26 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%28 = "arith.muli"(%24, %5) : (index, index) -> index | |
%29 = "arith.addi"(%27, %28) : (index, index) -> index | |
%30 = "arith.muli"(%26, %4) : (index, index) -> index | |
%31 = "arith.addi"(%29, %30) : (index, index) -> index | |
%32 = "arith.muli"(%25, %3) : (index, index) -> index | |
%33 = "arith.addi"(%31, %32) : (index, index) -> index | |
%34 = "arith.cmpi"(%15, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%35 = "arith.subi"(%1, %15) : (index, index) -> index | |
%36 = "arith.select"(%34, %35, %15) : (i1, index, index) -> index | |
%37 = "arith.divsi"(%36, %2) : (index, index) -> index | |
%38 = "arith.subi"(%1, %37) : (index, index) -> index | |
%39 = "arith.select"(%34, %38, %37) : (i1, index, index) -> index | |
%40 = "arith.addi"(%33, %39) : (index, index) -> index | |
%41 = "memref.load"(%19, %40) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%42 = "arith.cmpi"(%16, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%43 = "arith.subi"(%1, %16) : (index, index) -> index | |
%44 = "arith.select"(%42, %43, %16) : (i1, index, index) -> index | |
%45 = "arith.divsi"(%44, %0) : (index, index) -> index | |
%46 = "arith.subi"(%1, %45) : (index, index) -> index | |
%47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
%48 = "arith.addi"(%26, %47) : (index, index) -> index | |
%49 = "memref.load"(%21, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%50 = "vector.insert"(%49, %9) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%51 = "arith.divf"(%50, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%52 = "vector.extract"(%51) {position = [0]} : (vector<1xf32>) -> f32 | |
%53 = "vector.insert"(%52, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%54 = "vector.insert"(%52, %53) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%55 = "vector.insert"(%52, %54) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%56 = "vector.insert"(%52, %55) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%57 = "arith.subf"(%41, %56) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%58 = "arith.cmpi"(%17, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%59 = "arith.subi"(%1, %17) : (index, index) -> index | |
%60 = "arith.select"(%58, %59, %17) : (i1, index, index) -> index | |
%61 = "arith.divsi"(%60, %2) : (index, index) -> index | |
%62 = "arith.subi"(%1, %61) : (index, index) -> index | |
%63 = "arith.select"(%58, %62, %61) : (i1, index, index) -> index | |
%64 = "arith.addi"(%33, %63) : (index, index) -> index | |
"memref.store"(%57, %23, %64) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_11_generic_64x10x9216"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:876:11: error: failed to serialize executables | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 72 : index} : () -> index | |
%1 = "arith.constant"() {value = 10 : index} : () -> index | |
%2 = "arith.constant"() {value = 64 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_11_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 4 : index} : () -> index | |
%1 = "arith.constant"() {value = -1 : index} : () -> index | |
%2 = "arith.constant"() {value = 16 : index} : () -> index | |
%3 = "arith.constant"() {value = 2304 : index} : () -> index | |
%4 = "arith.constant"() {value = 23040 : index} : () -> index | |
%5 = "arith.constant"() {value = 32 : index} : () -> index | |
%6 = "arith.constant"() {value = 0 : index} : () -> index | |
%7 = "arith.constant"() {value = 64 : index} : () -> index | |
%8 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%9 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%10 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
%11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%14 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
%15 = "arith.index_castui"(%12) : (i32) -> index | |
%16 = "arith.index_castui"(%13) : (i32) -> index | |
%17 = "arith.index_castui"(%14) : (i32) -> index | |
%18 = "hal.interface.binding.subspan"(%15, %8) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%16, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.binding.subspan"(%6, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%22 = "hal.interface.binding.subspan"(%17, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%23 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%24 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%25 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%26 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%28 = "arith.muli"(%24, %5) : (index, index) -> index | |
%29 = "arith.addi"(%27, %28) : (index, index) -> index | |
%30 = "arith.muli"(%26, %4) : (index, index) -> index | |
%31 = "arith.addi"(%29, %30) : (index, index) -> index | |
%32 = "arith.muli"(%25, %3) : (index, index) -> index | |
%33 = "arith.addi"(%31, %32) : (index, index) -> index | |
%34 = "arith.cmpi"(%15, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%35 = "arith.subi"(%1, %15) : (index, index) -> index | |
%36 = "arith.select"(%34, %35, %15) : (i1, index, index) -> index | |
%37 = "arith.divsi"(%36, %2) : (index, index) -> index | |
%38 = "arith.subi"(%1, %37) : (index, index) -> index | |
%39 = "arith.select"(%34, %38, %37) : (i1, index, index) -> index | |
%40 = "arith.addi"(%33, %39) : (index, index) -> index | |
%41 = "memref.load"(%19, %40) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%42 = "arith.cmpi"(%16, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%43 = "arith.subi"(%1, %16) : (index, index) -> index | |
%44 = "arith.select"(%42, %43, %16) : (i1, index, index) -> index | |
%45 = "arith.divsi"(%44, %0) : (index, index) -> index | |
%46 = "arith.subi"(%1, %45) : (index, index) -> index | |
%47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
%48 = "arith.addi"(%26, %47) : (index, index) -> index | |
%49 = "memref.load"(%21, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%50 = "vector.insert"(%49, %9) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%51 = "arith.divf"(%50, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%52 = "vector.extract"(%51) {position = [0]} : (vector<1xf32>) -> f32 | |
%53 = "vector.insert"(%52, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%54 = "vector.insert"(%52, %53) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%55 = "vector.insert"(%52, %54) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%56 = "vector.insert"(%52, %55) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%57 = "arith.subf"(%41, %56) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%58 = "arith.cmpi"(%17, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
%59 = "arith.subi"(%1, %17) : (index, index) -> index | |
%60 = "arith.select"(%58, %59, %17) : (i1, index, index) -> index | |
%61 = "arith.divsi"(%60, %2) : (index, index) -> index | |
%62 = "arith.subi"(%1, %61) : (index, index) -> index | |
%63 = "arith.select"(%58, %62, %61) : (i1, index, index) -> index | |
%64 = "arith.addi"(%33, %63) : (index, index) -> index | |
"memref.store"(%57, %23, %64) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_11_generic_64x10x9216"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_11", sym_visibility = "private"} : () -> () | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:886:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:886:11: note: see current operation: %51 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:886:11: note: see existing live user here: %59 = "spirv.UConvert"(%51) : (i32) -> i64 | |
/home/prashant/stable.mlir:886:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:886:11: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 64 : index} : () -> index | |
%1 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %1) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_12_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 16 : index} : () -> index | |
%1 = "arith.constant"() {value = 23040 : index} : () -> index | |
%2 = "arith.constant"() {value = -1 : index} : () -> index | |
%3 = "arith.constant"() {value = 4 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 64 : index} : () -> index | |
%6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
%10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
%11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
%12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
%13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
%14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
%15 = "arith.constant"() {value = 32 : index} : () -> index | |
%16 = "arith.constant"() {value = 15 : index} : () -> index | |
%17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
%18 = "arith.constant"() {value = 2048 : index} : () -> index | |
%19 = "arith.constant"() {value = 92160 : index} : () -> index | |
%20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%23 = "arith.index_castui"(%21) : (i32) -> index | |
%24 = "arith.index_castui"(%22) : (i32) -> index | |
%25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%31 = "scf.for"(%4, %19, %18, %8) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>): | |
%70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
%72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
%73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
%74 = "arith.subi"(%2, %73) : (index, index) -> index | |
%75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
%76 = "arith.muli"(%30, %1) : (index, index) -> index | |
%77 = "arith.addi"(%20, %76) : (index, index) -> index | |
%78 = "arith.addi"(%75, %77) : (index, index) -> index | |
%79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%80 = "arith.subi"(%2, %23) : (index, index) -> index | |
%81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
%82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
%83 = "arith.subi"(%2, %82) : (index, index) -> index | |
%84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
%85 = "arith.addi"(%78, %84) : (index, index) -> index | |
%86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%87 = "arith.mulf"(%86, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%88 = "arith.addf"(%87, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"scf.yield"(%88) : (vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
%32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%33 = "arith.subi"(%2, %24) : (index, index) -> index | |
%34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
%35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
%36 = "arith.subi"(%2, %35) : (index, index) -> index | |
%37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
%38 = "arith.addi"(%30, %37) : (index, index) -> index | |
%39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
%42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
%43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
%54 = "arith.divui"(%20, %15) : (index, index) -> index | |
%55 = "arith.remui"(%20, %15) : (index, index) -> index | |
%56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%56) ({ | |
"memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"gpu.barrier"() : () -> () | |
%57 = "arith.minui"(%55, %16) : (index, index) -> index | |
%58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
%59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
%68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%69) ({ | |
"memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_12_generic_64x92160"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:886:11: error: failed to serialize executables | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:886:11: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 64 : index} : () -> index | |
%1 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %1) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_12_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 16 : index} : () -> index | |
%1 = "arith.constant"() {value = 23040 : index} : () -> index | |
%2 = "arith.constant"() {value = -1 : index} : () -> index | |
%3 = "arith.constant"() {value = 4 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 64 : index} : () -> index | |
%6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
%10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
%11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
%12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
%13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
%14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
%15 = "arith.constant"() {value = 32 : index} : () -> index | |
%16 = "arith.constant"() {value = 15 : index} : () -> index | |
%17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
%18 = "arith.constant"() {value = 2048 : index} : () -> index | |
%19 = "arith.constant"() {value = 92160 : index} : () -> index | |
%20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%23 = "arith.index_castui"(%21) : (i32) -> index | |
%24 = "arith.index_castui"(%22) : (i32) -> index | |
%25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%31 = "scf.for"(%4, %19, %18, %8) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>): | |
%70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
%72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
%73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
%74 = "arith.subi"(%2, %73) : (index, index) -> index | |
%75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
%76 = "arith.muli"(%30, %1) : (index, index) -> index | |
%77 = "arith.addi"(%20, %76) : (index, index) -> index | |
%78 = "arith.addi"(%75, %77) : (index, index) -> index | |
%79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%80 = "arith.subi"(%2, %23) : (index, index) -> index | |
%81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
%82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
%83 = "arith.subi"(%2, %82) : (index, index) -> index | |
%84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
%85 = "arith.addi"(%78, %84) : (index, index) -> index | |
%86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%87 = "arith.mulf"(%86, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%88 = "arith.addf"(%87, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"scf.yield"(%88) : (vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
%32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%33 = "arith.subi"(%2, %24) : (index, index) -> index | |
%34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
%35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
%36 = "arith.subi"(%2, %35) : (index, index) -> index | |
%37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
%38 = "arith.addi"(%30, %37) : (index, index) -> index | |
%39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
%42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
%43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
%54 = "arith.divui"(%20, %15) : (index, index) -> index | |
%55 = "arith.remui"(%20, %15) : (index, index) -> index | |
%56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%56) ({ | |
"memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"gpu.barrier"() : () -> () | |
%57 = "arith.minui"(%55, %16) : (index, index) -> index | |
%58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
%59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
%66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
%68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
%69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
"scf.if"(%69) ({ | |
"memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}, { | |
}) : (i1) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_12_generic_64x92160"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_12", sym_visibility = "private"} : () -> () | |
%36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
^ | |
/home/prashant/stable.mlir:876:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see current operation: %30 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
^ | |
/home/prashant/stable.mlir:876:11: note: see existing live user here: %38 = "spirv.UConvert"(%30) : (i32) -> i64 | |
/home/prashant/stable.mlir:930:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:930:11: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 72 : index} : () -> index | |
%1 = "arith.constant"() {value = 10 : index} : () -> index | |
%2 = "arith.constant"() {value = 64 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_13_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 16 : index} : () -> index | |
%2 = "arith.constant"() {value = 2304 : index} : () -> index | |
%3 = "arith.constant"() {value = 23040 : index} : () -> index | |
%4 = "arith.constant"() {value = 32 : index} : () -> index | |
%5 = "arith.constant"() {value = 0 : index} : () -> index | |
%6 = "arith.constant"() {value = 64 : index} : () -> index | |
%7 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%9 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
%10 = "arith.constant"() {value = dense<9.99999974E-6> : vector<1xf32>} : () -> vector<1xf32> | |
%11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%14 = "arith.index_castui"(%12) : (i32) -> index | |
%15 = "arith.index_castui"(%13) : (i32) -> index | |
%16 = "hal.interface.binding.subspan"(%14, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%17 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%18 = "hal.interface.binding.subspan"(%5, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%15, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%22 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%23 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%24 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%25 = "arith.muli"(%21, %4) : (index, index) -> index | |
%26 = "arith.addi"(%24, %25) : (index, index) -> index | |
%27 = "arith.muli"(%23, %3) : (index, index) -> index | |
%28 = "arith.addi"(%26, %27) : (index, index) -> index | |
%29 = "arith.muli"(%22, %2) : (index, index) -> index | |
%30 = "arith.addi"(%28, %29) : (index, index) -> index | |
%31 = "arith.cmpi"(%14, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
%32 = "arith.subi"(%0, %14) : (index, index) -> index | |
%33 = "arith.select"(%31, %32, %14) : (i1, index, index) -> index | |
%34 = "arith.divsi"(%33, %1) : (index, index) -> index | |
%35 = "arith.subi"(%0, %34) : (index, index) -> index | |
%36 = "arith.select"(%31, %35, %34) : (i1, index, index) -> index | |
%37 = "arith.addi"(%30, %36) : (index, index) -> index | |
%38 = "memref.load"(%17, %37) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%39 = "memref.load"(%18, %23) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %8) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "arith.divf"(%40, %9) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%42 = "arith.addf"(%41, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%43 = "math.rsqrt"(%42) {fastmath = #arith.fastmath<none>} : (vector<1xf32>) -> vector<1xf32> | |
%44 = "vector.extract"(%43) {position = [0]} : (vector<1xf32>) -> f32 | |
%45 = "vector.insert"(%44, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%46 = "vector.insert"(%44, %45) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%47 = "vector.insert"(%44, %46) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%48 = "vector.insert"(%44, %47) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%49 = "arith.mulf"(%38, %48) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%50 = "arith.cmpi"(%15, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
%51 = "arith.subi"(%0, %15) : (index, index) -> index | |
%52 = "arith.select"(%50, %51, %15) : (i1, index, index) -> index | |
%53 = "arith.divsi"(%52, %1) : (index, index) -> index | |
%54 = "arith.subi"(%0, %53) : (index, index) -> index | |
%55 = "arith.select"(%50, %54, %53) : (i1, index, index) -> index | |
%56 = "arith.addi"(%30, %55) : (index, index) -> index | |
"memref.store"(%49, %20, %56) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_13_generic_64x10x9216"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
^ | |
/home/prashant/stable.mlir:930:11: error: failed to serialize executables | |
%47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:930:11: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 72 : index} : () -> index | |
%1 = "arith.constant"() {value = 10 : index} : () -> index | |
%2 = "arith.constant"() {value = 64 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_13_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 16 : index} : () -> index | |
%2 = "arith.constant"() {value = 2304 : index} : () -> index | |
%3 = "arith.constant"() {value = 23040 : index} : () -> index | |
%4 = "arith.constant"() {value = 32 : index} : () -> index | |
%5 = "arith.constant"() {value = 0 : index} : () -> index | |
%6 = "arith.constant"() {value = 64 : index} : () -> index | |
%7 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%8 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
%9 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
%10 = "arith.constant"() {value = dense<9.99999974E-6> : vector<1xf32>} : () -> vector<1xf32> | |
%11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%14 = "arith.index_castui"(%12) : (i32) -> index | |
%15 = "arith.index_castui"(%13) : (i32) -> index | |
%16 = "hal.interface.binding.subspan"(%14, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%17 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%18 = "hal.interface.binding.subspan"(%5, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%15, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%22 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%23 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%24 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%25 = "arith.muli"(%21, %4) : (index, index) -> index | |
%26 = "arith.addi"(%24, %25) : (index, index) -> index | |
%27 = "arith.muli"(%23, %3) : (index, index) -> index | |
%28 = "arith.addi"(%26, %27) : (index, index) -> index | |
%29 = "arith.muli"(%22, %2) : (index, index) -> index | |
%30 = "arith.addi"(%28, %29) : (index, index) -> index | |
%31 = "arith.cmpi"(%14, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
%32 = "arith.subi"(%0, %14) : (index, index) -> index | |
%33 = "arith.select"(%31, %32, %14) : (i1, index, index) -> index | |
%34 = "arith.divsi"(%33, %1) : (index, index) -> index | |
%35 = "arith.subi"(%0, %34) : (index, index) -> index | |
%36 = "arith.select"(%31, %35, %34) : (i1, index, index) -> index | |
%37 = "arith.addi"(%30, %36) : (index, index) -> index | |
%38 = "memref.load"(%17, %37) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%39 = "memref.load"(%18, %23) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%40 = "vector.insert"(%39, %8) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
%41 = "arith.divf"(%40, %9) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%42 = "arith.addf"(%41, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
%43 = "math.rsqrt"(%42) {fastmath = #arith.fastmath<none>} : (vector<1xf32>) -> vector<1xf32> | |
%44 = "vector.extract"(%43) {position = [0]} : (vector<1xf32>) -> f32 | |
%45 = "vector.insert"(%44, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%46 = "vector.insert"(%44, %45) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%47 = "vector.insert"(%44, %46) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%48 = "vector.insert"(%44, %47) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
%49 = "arith.mulf"(%38, %48) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%50 = "arith.cmpi"(%15, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
%51 = "arith.subi"(%0, %15) : (index, index) -> index | |
%52 = "arith.select"(%50, %51, %15) : (i1, index, index) -> index | |
%53 = "arith.divsi"(%52, %1) : (index, index) -> index | |
%54 = "arith.subi"(%0, %53) : (index, index) -> index | |
%55 = "arith.select"(%50, %54, %53) : (i1, index, index) -> index | |
%56 = "arith.addi"(%30, %55) : (index, index) -> index | |
"memref.store"(%49, %20, %56) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_13_generic_64x10x9216"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_13", sym_visibility = "private"} : () -> () | |
%47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
^ | |
/home/prashant/stable.mlir:936:21: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%expanded_730 = tensor.expand_shape %collapsed_729 [[0], [1], [2, 3]] : tensor<2x320x9216xf32> into tensor<2x320x96x96xf32> | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:936:21: note: see current operation: %58 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%expanded_730 = tensor.expand_shape %collapsed_729 [[0], [1], [2, 3]] : tensor<2x320x9216xf32> into tensor<2x320x96x96xf32> | |
^ | |
/home/prashant/stable.mlir:936:21: note: see existing live user here: %80 = "spirv.UConvert"(%58) : (i32) -> i64 | |
/home/prashant/stable.mlir:957:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:957:11: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
%0 = "arith.constant"() {value = 3 : index} : () -> index | |
%1 = "arith.constant"() {value = 24 : index} : () -> index | |
%2 = "arith.constant"() {value = 640 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_14_generic_2x320x96x96", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 4 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 4 : index} : () -> index | |
%1 = "arith.constant"() {value = -1 : index} : () -> index | |
%2 = "arith.constant"() {value = 16 : index} : () -> index | |
%3 = "arith.constant"() {value = 8 : index} : () -> index | |
%4 = "arith.constant"() {value = 24 : index} : () -> index | |
%5 = "arith.constant"() {value = 96 : index} : () -> index | |
%6 = "arith.constant"() {value = 2304 : index} : () -> index | |
%7 = "arith.constant"() {value = 737280 : index} : () -> index | |
%8 = "arith.constant"() {value = 0 : index} : () -> index | |
%9 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%10 = "arith.constant"() {value = dense<0.693147182> : vector<4xf32>} : () -> vector<4xf32> | |
%11 = "arith.constant"() {value = dense<1.44269502> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "arith.constant"() {value = dense<0.499705136> : vector<4xf32>} : () -> vector<4xf32> | |
%13 = "arith.constant"() {value = dense<0.168738902> : vector<4xf32>} : () -> vector<4xf32> | |
%14 = "arith.constant"() {value = dense<0.0366896503> : vector<4xf32>} : () -> vector<4xf32> | |
%15 = "arith.constant"() {value = dense<1.314350e-02> : vector<4xf32>} : () -> vector<4xf32> | |
%16 = "arith.constant"() {value = dense<23> : vector<4xi32>} : () -> vector<4xi32> | |
%17 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%18 = "arith.constant"() {value = dense<0x7F800000> : vector<4xf32>} : () -> vector<4xf32> | |
%19 = "arith.constant"() {value = dense<0xFF800000> : vector<4xf32>} : () -> vector<4xf32> | |
%20 = "arith.constant"() {value = dense<1.17549435E-38> : vector<4xf32>} : () -> vector<4xf32> | |
%21 = "arith.constant"() {value = dense<127> : vector<4xi32>} : () -> vector<4xi32> | |
%22 = "arith.constant"() {value = dense<-127> : vector<4xi32>} : () -> vector<4xi32> | |
%23 = "arith.constant"() {value = dense<1.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%24 = "arith.constant"() {value = 320 : index} : () -> index | |
%25 = "arith.constant"() {value = 2 : index} : () -> index | |
%26 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%27 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%28 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
%29 = "hal.interface.constant.load"() {index = 3 : index} : () -> i32 | |
%30 = "arith.index_castui"(%26) : (i32) -> index | |
%31 = "arith.index_castui"(%27) : (i32) -> index | |
%32 = "arith.index_castui"(%28) : (i32) -> index | |
%33 = "arith.index_castui"(%29) : (i32) -> index | |
%34 = "hal.interface.binding.subspan"(%30, %9) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%35 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%36 = "hal.interface.binding.subspan"(%31, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%37 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%38 = "hal.interface.binding.subspan"(%32, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%39 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%40 = "hal.interface.binding.subspan"(%33, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%41 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%42 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%43 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%44 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%45 = "arith.remui"(%44, %24) : (index, index) -> index | |
%46 = "arith.divui"(%44, %24) : (index, index) -> index | |
%47 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%48 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
"scf.for"(%46, %25, %25) ({ | |
^bb0(%arg0: index): | |
"scf.for"(%45, %24, %24) ({ | |
^bb0(%arg1: index): | |
%49 = "arith.muli"(%arg0, %7) : (index, index) -> index | |
%50 = "arith.muli"(%arg1, %6) : (index, index) -> index | |
%51 = "arith.addi"(%49, %50) : (index, index) -> index | |
%52 = "arith.muli"(%43, %5) : (index, index) -> index | |
%53 = "arith.addi"(%51, %52) : (index, index) -> index | |
%54 = "arith.muli"(%47, %4) : (index, index) -> index | |
%55 = "arith.addi"(%53, %54) : (index, index) -> index | |
%56 = "arith.addi"(%55, %48) : (index, index) -> index | |
%57 = "arith.muli"(%42, %3) : (index, index) -> index | |
%58 = "arith.addi"(%56, %57) : (index, index) -> index | |
%59 = "arith.cmpi"(%30, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%60 = "arith.subi"(%1, %30) : (index, index) -> index | |
%61 = "arith.select"(%59, %60, %30) : (i1, index, index) -> index | |
%62 = "arith.divsi"(%61, %2) : (index, index) -> index | |
%63 = "arith.subi"(%1, %62) : (index, index) -> index | |
%64 = "arith.select"(%59, %63, %62) : (i1, index, index) -> index | |
%65 = "arith.addi"(%58, %64) : (index, index) -> index | |
%66 = "memref.load"(%35, %65) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%67 = "arith.cmpi"(%31, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%68 = "arith.subi"(%1, %31) : (index, index) -> index | |
%69 = "arith.select"(%67, %68, %31) : (i1, index, index) -> index | |
%70 = "arith.divsi"(%69, %0) : (index, index) -> index | |
%71 = "arith.subi"(%1, %70) : (index, index) -> index | |
%72 = "arith.select"(%67, %71, %70) : (i1, index, index) -> index | |
%73 = "arith.addi"(%arg1, %72) : (index, index) -> index | |
%74 = "memref.load"(%37, %73) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%75 = "arith.cmpi"(%32, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%76 = "arith.subi"(%1, %32) : (index, index) -> index | |
%77 = "arith.select"(%75, %76, %32) : (i1, index, index) -> index | |
%78 = "arith.divsi"(%77, %0) : (index, index) -> index | |
%79 = "arith.subi"(%1, %78) : (index, index) -> index | |
%80 = "arith.select"(%75, %79, %78) : (i1, index, index) -> index | |
%81 = "arith.addi"(%arg1, %80) : (index, index) -> index | |
%82 = "memref.load"(%39, %81) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%83 = "vector.splat"(%74) : (f32) -> vector<4xf32> | |
%84 = "arith.mulf"(%66, %83) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%85 = "vector.splat"(%82) : (f32) -> vector<4xf32> | |
%86 = "arith.addf"(%84, %85) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%87 = "arith.negf"(%86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
%88 = "arith.cmpf"(%87, %87) {predicate = 14 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%89 = "arith.mulf"(%87, %11) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%90 = "math.floor"(%89) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
%91 = "arith.mulf"(%90, %10) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%92 = "arith.subf"(%87, %91) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%93 = "arith.mulf"(%92, %92) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%94 = "arith.mulf"(%93, %93) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%95 = "math.fma"(%23, %92, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%96 = "math.fma"(%13, %92, %12) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%97 = "math.fma"(%15, %92, %14) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%98 = "math.fma"(%96, %93, %95) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%99 = "math.fma"(%97, %94, %98) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%100 = "arith.fptosi"(%90) : (vector<4xf32>) -> vector<4xi32> | |
%101 = "arith.addi"(%100, %21) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
%102 = "arith.shli"(%101, %16) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
%103 = "arith.bitcast"(%102) : (vector<4xi32>) -> vector<4xf32> | |
%104 = "arith.mulf"(%99, %103) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%105 = "arith.cmpi"(%100, %21) {predicate = 3 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
%106 = "arith.cmpi"(%100, %22) {predicate = 5 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
%107 = "arith.cmpf"(%87, %19) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%108 = "arith.cmpf"(%87, %18) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%109 = "arith.cmpf"(%87, %17) {predicate = 2 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%110 = "arith.andi"(%105, %106) : (vector<4xi1>, vector<4xi1>) -> vector<4xi1> | |
%111 = "arith.select"(%109, %18, %20) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%112 = "arith.select"(%110, %104, %111) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%113 = "arith.select"(%108, %18, %112) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%114 = "arith.select"(%107, %17, %113) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%115 = "arith.select"(%88, %87, %114) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%116 = "arith.addf"(%115, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%117 = "arith.divf"(%23, %116) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%118 = "arith.mulf"(%117, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%119 = "arith.cmpi"(%33, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%120 = "arith.subi"(%1, %33) : (index, index) -> index | |
%121 = "arith.select"(%119, %120, %33) : (i1, index, index) -> index | |
%122 = "arith.divsi"(%121, %2) : (index, index) -> index | |
%123 = "arith.subi"(%1, %122) : (index, index) -> index | |
%124 = "arith.select"(%119, %123, %122) : (i1, index, index) -> index | |
%125 = "arith.addi"(%58, %124) : (index, index) -> index | |
"memref.store"(%118, %41, %125) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [8, 4, 1]>, sym_name = "forward_dispatch_14_generic_2x320x96x96"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:957:11: error: failed to serialize executables | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:957:11: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
%0 = "arith.constant"() {value = 3 : index} : () -> index | |
%1 = "arith.constant"() {value = 24 : index} : () -> index | |
%2 = "arith.constant"() {value = 640 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_14_generic_2x320x96x96", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 4 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 4 : index} : () -> index | |
%1 = "arith.constant"() {value = -1 : index} : () -> index | |
%2 = "arith.constant"() {value = 16 : index} : () -> index | |
%3 = "arith.constant"() {value = 8 : index} : () -> index | |
%4 = "arith.constant"() {value = 24 : index} : () -> index | |
%5 = "arith.constant"() {value = 96 : index} : () -> index | |
%6 = "arith.constant"() {value = 2304 : index} : () -> index | |
%7 = "arith.constant"() {value = 737280 : index} : () -> index | |
%8 = "arith.constant"() {value = 0 : index} : () -> index | |
%9 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%10 = "arith.constant"() {value = dense<0.693147182> : vector<4xf32>} : () -> vector<4xf32> | |
%11 = "arith.constant"() {value = dense<1.44269502> : vector<4xf32>} : () -> vector<4xf32> | |
%12 = "arith.constant"() {value = dense<0.499705136> : vector<4xf32>} : () -> vector<4xf32> | |
%13 = "arith.constant"() {value = dense<0.168738902> : vector<4xf32>} : () -> vector<4xf32> | |
%14 = "arith.constant"() {value = dense<0.0366896503> : vector<4xf32>} : () -> vector<4xf32> | |
%15 = "arith.constant"() {value = dense<1.314350e-02> : vector<4xf32>} : () -> vector<4xf32> | |
%16 = "arith.constant"() {value = dense<23> : vector<4xi32>} : () -> vector<4xi32> | |
%17 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%18 = "arith.constant"() {value = dense<0x7F800000> : vector<4xf32>} : () -> vector<4xf32> | |
%19 = "arith.constant"() {value = dense<0xFF800000> : vector<4xf32>} : () -> vector<4xf32> | |
%20 = "arith.constant"() {value = dense<1.17549435E-38> : vector<4xf32>} : () -> vector<4xf32> | |
%21 = "arith.constant"() {value = dense<127> : vector<4xi32>} : () -> vector<4xi32> | |
%22 = "arith.constant"() {value = dense<-127> : vector<4xi32>} : () -> vector<4xi32> | |
%23 = "arith.constant"() {value = dense<1.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%24 = "arith.constant"() {value = 320 : index} : () -> index | |
%25 = "arith.constant"() {value = 2 : index} : () -> index | |
%26 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%27 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%28 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
%29 = "hal.interface.constant.load"() {index = 3 : index} : () -> i32 | |
%30 = "arith.index_castui"(%26) : (i32) -> index | |
%31 = "arith.index_castui"(%27) : (i32) -> index | |
%32 = "arith.index_castui"(%28) : (i32) -> index | |
%33 = "arith.index_castui"(%29) : (i32) -> index | |
%34 = "hal.interface.binding.subspan"(%30, %9) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%35 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%36 = "hal.interface.binding.subspan"(%31, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%37 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%38 = "hal.interface.binding.subspan"(%32, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%39 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%40 = "hal.interface.binding.subspan"(%33, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%41 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%42 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%43 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%44 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%45 = "arith.remui"(%44, %24) : (index, index) -> index | |
%46 = "arith.divui"(%44, %24) : (index, index) -> index | |
%47 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%48 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
"scf.for"(%46, %25, %25) ({ | |
^bb0(%arg0: index): | |
"scf.for"(%45, %24, %24) ({ | |
^bb0(%arg1: index): | |
%49 = "arith.muli"(%arg0, %7) : (index, index) -> index | |
%50 = "arith.muli"(%arg1, %6) : (index, index) -> index | |
%51 = "arith.addi"(%49, %50) : (index, index) -> index | |
%52 = "arith.muli"(%43, %5) : (index, index) -> index | |
%53 = "arith.addi"(%51, %52) : (index, index) -> index | |
%54 = "arith.muli"(%47, %4) : (index, index) -> index | |
%55 = "arith.addi"(%53, %54) : (index, index) -> index | |
%56 = "arith.addi"(%55, %48) : (index, index) -> index | |
%57 = "arith.muli"(%42, %3) : (index, index) -> index | |
%58 = "arith.addi"(%56, %57) : (index, index) -> index | |
%59 = "arith.cmpi"(%30, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%60 = "arith.subi"(%1, %30) : (index, index) -> index | |
%61 = "arith.select"(%59, %60, %30) : (i1, index, index) -> index | |
%62 = "arith.divsi"(%61, %2) : (index, index) -> index | |
%63 = "arith.subi"(%1, %62) : (index, index) -> index | |
%64 = "arith.select"(%59, %63, %62) : (i1, index, index) -> index | |
%65 = "arith.addi"(%58, %64) : (index, index) -> index | |
%66 = "memref.load"(%35, %65) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%67 = "arith.cmpi"(%31, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%68 = "arith.subi"(%1, %31) : (index, index) -> index | |
%69 = "arith.select"(%67, %68, %31) : (i1, index, index) -> index | |
%70 = "arith.divsi"(%69, %0) : (index, index) -> index | |
%71 = "arith.subi"(%1, %70) : (index, index) -> index | |
%72 = "arith.select"(%67, %71, %70) : (i1, index, index) -> index | |
%73 = "arith.addi"(%arg1, %72) : (index, index) -> index | |
%74 = "memref.load"(%37, %73) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%75 = "arith.cmpi"(%32, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%76 = "arith.subi"(%1, %32) : (index, index) -> index | |
%77 = "arith.select"(%75, %76, %32) : (i1, index, index) -> index | |
%78 = "arith.divsi"(%77, %0) : (index, index) -> index | |
%79 = "arith.subi"(%1, %78) : (index, index) -> index | |
%80 = "arith.select"(%75, %79, %78) : (i1, index, index) -> index | |
%81 = "arith.addi"(%arg1, %80) : (index, index) -> index | |
%82 = "memref.load"(%39, %81) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%83 = "vector.splat"(%74) : (f32) -> vector<4xf32> | |
%84 = "arith.mulf"(%66, %83) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%85 = "vector.splat"(%82) : (f32) -> vector<4xf32> | |
%86 = "arith.addf"(%84, %85) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%87 = "arith.negf"(%86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
%88 = "arith.cmpf"(%87, %87) {predicate = 14 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%89 = "arith.mulf"(%87, %11) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%90 = "math.floor"(%89) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
%91 = "arith.mulf"(%90, %10) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%92 = "arith.subf"(%87, %91) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%93 = "arith.mulf"(%92, %92) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%94 = "arith.mulf"(%93, %93) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%95 = "math.fma"(%23, %92, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%96 = "math.fma"(%13, %92, %12) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%97 = "math.fma"(%15, %92, %14) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%98 = "math.fma"(%96, %93, %95) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%99 = "math.fma"(%97, %94, %98) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%100 = "arith.fptosi"(%90) : (vector<4xf32>) -> vector<4xi32> | |
%101 = "arith.addi"(%100, %21) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
%102 = "arith.shli"(%101, %16) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
%103 = "arith.bitcast"(%102) : (vector<4xi32>) -> vector<4xf32> | |
%104 = "arith.mulf"(%99, %103) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%105 = "arith.cmpi"(%100, %21) {predicate = 3 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
%106 = "arith.cmpi"(%100, %22) {predicate = 5 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
%107 = "arith.cmpf"(%87, %19) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%108 = "arith.cmpf"(%87, %18) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%109 = "arith.cmpf"(%87, %17) {predicate = 2 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
%110 = "arith.andi"(%105, %106) : (vector<4xi1>, vector<4xi1>) -> vector<4xi1> | |
%111 = "arith.select"(%109, %18, %20) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%112 = "arith.select"(%110, %104, %111) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%113 = "arith.select"(%108, %18, %112) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%114 = "arith.select"(%107, %17, %113) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%115 = "arith.select"(%88, %87, %114) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%116 = "arith.addf"(%115, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%117 = "arith.divf"(%23, %116) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%118 = "arith.mulf"(%117, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%119 = "arith.cmpi"(%33, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
%120 = "arith.subi"(%1, %33) : (index, index) -> index | |
%121 = "arith.select"(%119, %120, %33) : (i1, index, index) -> index | |
%122 = "arith.divsi"(%121, %2) : (index, index) -> index | |
%123 = "arith.subi"(%1, %122) : (index, index) -> index | |
%124 = "arith.select"(%119, %123, %122) : (i1, index, index) -> index | |
%125 = "arith.addi"(%58, %124) : (index, index) -> index | |
"memref.store"(%118, %41, %125) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [8, 4, 1]>, sym_name = "forward_dispatch_14_generic_2x320x96x96"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_14", sym_visibility = "private"} : () -> () | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:957:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:957:11: note: see current operation: %38 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
^ | |
/home/prashant/stable.mlir:957:11: note: see existing live user here: %39 = "spirv.UConvert"(%38) : (i32) -> i64 | |
/home/prashant/stable.mlir:962:19: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:962:19: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
%0 = "arith.constant"() {value = 3 : index} : () -> index | |
%1 = "arith.constant"() {value = 96 : index} : () -> index | |
%2 = "arith.constant"() {value = 640 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_15", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 17845699 : index} : () -> index | |
%1 = "arith.constant"() {value = 98 : index} : () -> index | |
%2 = "arith.constant"() {value = 9604 : index} : () -> index | |
%3 = "arith.constant"() {value = 3073280 : index} : () -> index | |
%4 = "arith.constant"() {value = -1 : index} : () -> index | |
%5 = "arith.constant"() {value = 4 : index} : () -> index | |
%6 = "arith.constant"() {value = 32 : index} : () -> index | |
%7 = "arith.constant"() {value = 96 : index} : () -> index | |
%8 = "arith.constant"() {value = 9216 : index} : () -> index | |
%9 = "arith.constant"() {value = 2949120 : index} : () -> index | |
%10 = "arith.constant"() {value = 0 : index} : () -> index | |
%11 = "arith.constant"() {value = 6146560 : index} : () -> index | |
%12 = "arith.constant"() {value = 5898240 : index} : () -> index | |
%13 = "arith.constant"() {value = 320 : index} : () -> index | |
%14 = "arith.constant"() {value = 2 : index} : () -> index | |
%15 = "arith.constant"() {value = 71382400 : index} : () -> index | |
%16 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%17 = "arith.index_castui"(%16) : (i32) -> index | |
%18 = "hal.interface.binding.subspan"(%17, %12) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%10, %12) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%15, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.binding.subspan"(%10, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%22 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%23 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%24 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%25 = "arith.remui"(%24, %13) : (index, index) -> index | |
%26 = "arith.divui"(%24, %13) : (index, index) -> index | |
%27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%28 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%29 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
"scf.for"(%26, %14, %14) ({ | |
^bb0(%arg0: index): | |
"scf.for"(%25, %13, %13) ({ | |
^bb0(%arg1: index): | |
%30 = "arith.muli"(%arg0, %9) : (index, index) -> index | |
%31 = "arith.muli"(%arg1, %8) : (index, index) -> index | |
%32 = "arith.addi"(%30, %31) : (index, index) -> index | |
%33 = "arith.muli"(%29, %8) : (index, index) -> index | |
%34 = "arith.addi"(%32, %33) : (index, index) -> index | |
%35 = "arith.muli"(%23, %7) : (index, index) -> index | |
%36 = "arith.addi"(%34, %35) : (index, index) -> index | |
%37 = "arith.muli"(%28, %7) : (index, index) -> index | |
%38 = "arith.addi"(%36, %37) : (index, index) -> index | |
%39 = "arith.muli"(%22, %6) : (index, index) -> index | |
%40 = "arith.addi"(%38, %39) : (index, index) -> index | |
%41 = "arith.addi"(%40, %27) : (index, index) -> index | |
%42 = "arith.cmpi"(%17, %10) {predicate = 2 : i64} : (index, index) -> i1 | |
%43 = "arith.subi"(%4, %17) : (index, index) -> index | |
%44 = "arith.select"(%42, %43, %17) : (i1, index, index) -> index | |
%45 = "arith.divsi"(%44, %5) : (index, index) -> index | |
%46 = "arith.subi"(%4, %45) : (index, index) -> index | |
%47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
%48 = "arith.addi"(%41, %47) : (index, index) -> index | |
%49 = "memref.load"(%19, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%50 = "arith.muli"(%arg0, %3) : (index, index) -> index | |
%51 = "arith.muli"(%arg1, %2) : (index, index) -> index | |
%52 = "arith.addi"(%50, %51) : (index, index) -> index | |
%53 = "arith.muli"(%29, %2) : (index, index) -> index | |
%54 = "arith.addi"(%52, %53) : (index, index) -> index | |
%55 = "arith.muli"(%23, %1) : (index, index) -> index | |
%56 = "arith.addi"(%54, %55) : (index, index) -> index | |
%57 = "arith.muli"(%28, %1) : (index, index) -> index | |
%58 = "arith.addi"(%56, %57) : (index, index) -> index | |
%59 = "arith.addi"(%58, %39) : (index, index) -> index | |
%60 = "arith.addi"(%59, %27) : (index, index) -> index | |
%61 = "arith.addi"(%60, %0) : (index, index) -> index | |
"memref.store"(%49, %21, %61) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_15"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^ | |
/home/prashant/stable.mlir:962:19: error: failed to serialize executables | |
%padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:962:19: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
%0 = "arith.constant"() {value = 3 : index} : () -> index | |
%1 = "arith.constant"() {value = 96 : index} : () -> index | |
%2 = "arith.constant"() {value = 640 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_15", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = 17845699 : index} : () -> index | |
%1 = "arith.constant"() {value = 98 : index} : () -> index | |
%2 = "arith.constant"() {value = 9604 : index} : () -> index | |
%3 = "arith.constant"() {value = 3073280 : index} : () -> index | |
%4 = "arith.constant"() {value = -1 : index} : () -> index | |
%5 = "arith.constant"() {value = 4 : index} : () -> index | |
%6 = "arith.constant"() {value = 32 : index} : () -> index | |
%7 = "arith.constant"() {value = 96 : index} : () -> index | |
%8 = "arith.constant"() {value = 9216 : index} : () -> index | |
%9 = "arith.constant"() {value = 2949120 : index} : () -> index | |
%10 = "arith.constant"() {value = 0 : index} : () -> index | |
%11 = "arith.constant"() {value = 6146560 : index} : () -> index | |
%12 = "arith.constant"() {value = 5898240 : index} : () -> index | |
%13 = "arith.constant"() {value = 320 : index} : () -> index | |
%14 = "arith.constant"() {value = 2 : index} : () -> index | |
%15 = "arith.constant"() {value = 71382400 : index} : () -> index | |
%16 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%17 = "arith.index_castui"(%16) : (i32) -> index | |
%18 = "hal.interface.binding.subspan"(%17, %12) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%19 = "hal.interface.binding.subspan"(%10, %12) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%20 = "hal.interface.binding.subspan"(%15, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%21 = "hal.interface.binding.subspan"(%10, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%22 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%23 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%24 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
%25 = "arith.remui"(%24, %13) : (index, index) -> index | |
%26 = "arith.divui"(%24, %13) : (index, index) -> index | |
%27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%28 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%29 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
"scf.for"(%26, %14, %14) ({ | |
^bb0(%arg0: index): | |
"scf.for"(%25, %13, %13) ({ | |
^bb0(%arg1: index): | |
%30 = "arith.muli"(%arg0, %9) : (index, index) -> index | |
%31 = "arith.muli"(%arg1, %8) : (index, index) -> index | |
%32 = "arith.addi"(%30, %31) : (index, index) -> index | |
%33 = "arith.muli"(%29, %8) : (index, index) -> index | |
%34 = "arith.addi"(%32, %33) : (index, index) -> index | |
%35 = "arith.muli"(%23, %7) : (index, index) -> index | |
%36 = "arith.addi"(%34, %35) : (index, index) -> index | |
%37 = "arith.muli"(%28, %7) : (index, index) -> index | |
%38 = "arith.addi"(%36, %37) : (index, index) -> index | |
%39 = "arith.muli"(%22, %6) : (index, index) -> index | |
%40 = "arith.addi"(%38, %39) : (index, index) -> index | |
%41 = "arith.addi"(%40, %27) : (index, index) -> index | |
%42 = "arith.cmpi"(%17, %10) {predicate = 2 : i64} : (index, index) -> i1 | |
%43 = "arith.subi"(%4, %17) : (index, index) -> index | |
%44 = "arith.select"(%42, %43, %17) : (i1, index, index) -> index | |
%45 = "arith.divsi"(%44, %5) : (index, index) -> index | |
%46 = "arith.subi"(%4, %45) : (index, index) -> index | |
%47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
%48 = "arith.addi"(%41, %47) : (index, index) -> index | |
%49 = "memref.load"(%19, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%50 = "arith.muli"(%arg0, %3) : (index, index) -> index | |
%51 = "arith.muli"(%arg1, %2) : (index, index) -> index | |
%52 = "arith.addi"(%50, %51) : (index, index) -> index | |
%53 = "arith.muli"(%29, %2) : (index, index) -> index | |
%54 = "arith.addi"(%52, %53) : (index, index) -> index | |
%55 = "arith.muli"(%23, %1) : (index, index) -> index | |
%56 = "arith.addi"(%54, %55) : (index, index) -> index | |
%57 = "arith.muli"(%28, %1) : (index, index) -> index | |
%58 = "arith.addi"(%56, %57) : (index, index) -> index | |
%59 = "arith.addi"(%58, %39) : (index, index) -> index | |
%60 = "arith.addi"(%59, %27) : (index, index) -> index | |
%61 = "arith.addi"(%60, %0) : (index, index) -> index | |
"memref.store"(%49, %21, %61) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"scf.yield"() : () -> () | |
}) : (index, index, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_15"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_15", sym_visibility = "private"} : () -> () | |
%padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
^ | |
/home/prashant/stable.mlir:1313:12: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
%130 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%cst_662 : tensor<320x320xf32>) outs(%101 : tensor<320x320xf32>) { | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:1313:12: note: see current operation: %187 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%130 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%cst_662 : tensor<320x320xf32>) outs(%101 : tensor<320x320xf32>) { | |
^ | |
/home/prashant/stable.mlir:1313:12: note: see existing live user here: %195 = "spirv.UConvert"(%187) : (i32) -> i64 | |
/home/prashant/stable.mlir:1320:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
%133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:1320:12: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 288 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_35_matmul_18432x320x320", translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>, workgroup_size = [16 : index, 16 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -33 : index} : () -> index | |
%1 = "arith.constant"() {value = 527 : index} : () -> index | |
%2 = "arith.constant"() {value = 510 : index} : () -> index | |
%3 = "arith.constant"() {value = 493 : index} : () -> index | |
%4 = "arith.constant"() {value = 476 : index} : () -> index | |
%5 = "arith.constant"() {value = 459 : index} : () -> index | |
%6 = "arith.constant"() {value = 442 : index} : () -> index | |
%7 = "arith.constant"() {value = 425 : index} : () -> index | |
%8 = "arith.constant"() {value = 408 : index} : () -> index | |
%9 = "arith.constant"() {value = 391 : index} : () -> index | |
%10 = "arith.constant"() {value = 374 : index} : () -> index | |
%11 = "arith.constant"() {value = 357 : index} : () -> index | |
%12 = "arith.constant"() {value = 340 : index} : () -> index | |
%13 = "arith.constant"() {value = 323 : index} : () -> index | |
%14 = "arith.constant"() {value = 306 : index} : () -> index | |
%15 = "arith.constant"() {value = 289 : index} : () -> index | |
%16 = "arith.constant"() {value = 255 : index} : () -> index | |
%17 = "arith.constant"() {value = 238 : index} : () -> index | |
%18 = "arith.constant"() {value = 221 : index} : () -> index | |
%19 = "arith.constant"() {value = 204 : index} : () -> index | |
%20 = "arith.constant"() {value = 187 : index} : () -> index | |
%21 = "arith.constant"() {value = 170 : index} : () -> index | |
%22 = "arith.constant"() {value = 153 : index} : () -> index | |
%23 = "arith.constant"() {value = 136 : index} : () -> index | |
%24 = "arith.constant"() {value = 119 : index} : () -> index | |
%25 = "arith.constant"() {value = 102 : index} : () -> index | |
%26 = "arith.constant"() {value = 85 : index} : () -> index | |
%27 = "arith.constant"() {value = 68 : index} : () -> index | |
%28 = "arith.constant"() {value = 51 : index} : () -> index | |
%29 = "arith.constant"() {value = 34 : index} : () -> index | |
%30 = "arith.constant"() {value = 33 : index} : () -> index | |
%31 = "arith.constant"() {value = 31 : index} : () -> index | |
%32 = "arith.constant"() {value = 30 : index} : () -> index | |
%33 = "arith.constant"() {value = 29 : index} : () -> index | |
%34 = "arith.constant"() {value = 28 : index} : () -> index | |
%35 = "arith.constant"() {value = 27 : index} : () -> index | |
%36 = "arith.constant"() {value = 25 : index} : () -> index | |
%37 = "arith.constant"() {value = 24 : index} : () -> index | |
%38 = "arith.constant"() {value = 23 : index} : () -> index | |
%39 = "arith.constant"() {value = 22 : index} : () -> index | |
%40 = "arith.constant"() {value = 21 : index} : () -> index | |
%41 = "arith.constant"() {value = 20 : index} : () -> index | |
%42 = "arith.constant"() {value = 19 : index} : () -> index | |
%43 = "arith.constant"() {value = 15 : index} : () -> index | |
%44 = "arith.constant"() {value = 14 : index} : () -> index | |
%45 = "arith.constant"() {value = 13 : index} : () -> index | |
%46 = "arith.constant"() {value = 12 : index} : () -> index | |
%47 = "arith.constant"() {value = 11 : index} : () -> index | |
%48 = "arith.constant"() {value = 10 : index} : () -> index | |
%49 = "arith.constant"() {value = 9 : index} : () -> index | |
%50 = "arith.constant"() {value = 7 : index} : () -> index | |
%51 = "arith.constant"() {value = 6 : index} : () -> index | |
%52 = "arith.constant"() {value = 5 : index} : () -> index | |
%53 = "arith.constant"() {value = 4 : index} : () -> index | |
%54 = "arith.constant"() {value = 3 : index} : () -> index | |
%55 = "arith.constant"() {value = 2 : index} : () -> index | |
%56 = "arith.constant"() {value = 1 : index} : () -> index | |
%57 = "arith.constant"() {value = 36 : index} : () -> index | |
%58 = "arith.constant"() {value = 272 : index} : () -> index | |
%59 = "arith.constant"() {value = 17 : index} : () -> index | |
%60 = "arith.constant"() {value = 18 : index} : () -> index | |
%61 = "arith.constant"() {value = 64 : index} : () -> index | |
%62 = "arith.constant"() {value = 1280 : index} : () -> index | |
%63 = "arith.constant"() {value = 1477120 : index} : () -> index | |
%64 = "arith.constant"() {value = 72 : index} : () -> index | |
%65 = "arith.constant"() {value = 8 : index} : () -> index | |
%66 = "arith.constant"() {value = 2560 : index} : () -> index | |
%67 = "arith.constant"() {value = 240 : index} : () -> index | |
%68 = "arith.constant"() {value = 160 : index} : () -> index | |
%69 = "arith.constant"() {value = 80 : index} : () -> index | |
%70 = "arith.constant"() {value = -1 : index} : () -> index | |
%71 = "arith.constant"() {value = 16 : index} : () -> index | |
%72 = "arith.constant"() {value = 320 : index} : () -> index | |
%73 = "arith.constant"() {value = 5120 : index} : () -> index | |
%74 = "arith.constant"() {value = 0 : index} : () -> index | |
%75 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%76 = "arith.constant"() {value = 25600 : index} : () -> index | |
%77 = "arith.constant"() {value = 288 : index} : () -> index | |
%78 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%79 = "arith.constant"() {value = 23592960 : index} : () -> index | |
%80 = "arith.constant"() {value = 32 : index} : () -> index | |
%81 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%82 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%83 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
%84 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
%85 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
%86 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%87 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%88 = "arith.index_castui"(%86) : (i32) -> index | |
%89 = "arith.index_castui"(%87) : (i32) -> index | |
%90 = "hal.interface.binding.subspan"(%79, %75) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%91 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%92 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%93 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%94 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%95 = "hal.interface.binding.subspan"(%88, %76) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%96 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%97 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%98 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%99 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%100 = "hal.interface.binding.subspan"(%89, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%101 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%102 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%103 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%104 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%105 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%106 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%107 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%108 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%109 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%110 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%111 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%112 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%113 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%114 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%115 = "arith.muli"(%114, %73) : (index, index) -> index | |
%116 = "arith.muli"(%82, %72) : (index, index) -> index | |
%117 = "arith.addi"(%115, %116) : (index, index) -> index | |
%118 = "arith.muli"(%113, %71) : (index, index) -> index | |
%119 = "arith.addi"(%117, %118) : (index, index) -> index | |
%120 = "arith.addi"(%119, %81) : (index, index) -> index | |
%121 = "arith.cmpi"(%89, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%122 = "arith.subi"(%70, %89) : (index, index) -> index | |
%123 = "arith.select"(%121, %122, %89) : (i1, index, index) -> index | |
%124 = "arith.divsi"(%123, %71) : (index, index) -> index | |
%125 = "arith.subi"(%70, %124) : (index, index) -> index | |
%126 = "arith.select"(%121, %125, %124) : (i1, index, index) -> index | |
%127 = "arith.addi"(%120, %126) : (index, index) -> index | |
"memref.store"(%78, %101, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%128 = "arith.addi"(%127, %69) : (index, index) -> index | |
"memref.store"(%78, %102, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%129 = "arith.addi"(%127, %68) : (index, index) -> index | |
"memref.store"(%78, %103, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%130 = "arith.addi"(%127, %67) : (index, index) -> index | |
"memref.store"(%78, %104, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%131 = "memref.load"(%105, %127) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%132 = "memref.load"(%106, %128) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%133 = "memref.load"(%107, %129) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%134 = "memref.load"(%108, %130) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%135 = "arith.addi"(%115, %81) : (index, index) -> index | |
%136 = "arith.muli"(%82, %68) : (index, index) -> index | |
%137 = "arith.addi"(%135, %136) : (index, index) -> index | |
%138 = "arith.muli"(%83, %66) : (index, index) -> index | |
%139 = "arith.addi"(%137, %138) : (index, index) -> index | |
%140 = "arith.cmpi"(%81, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%141 = "arith.subi"(%70, %81) : (index, index) -> index | |
%142 = "arith.select"(%140, %141, %81) : (i1, index, index) -> index | |
%143 = "arith.divsi"(%142, %65) : (index, index) -> index | |
%144 = "arith.subi"(%70, %143) : (index, index) -> index | |
%145 = "arith.select"(%140, %144, %143) : (i1, index, index) -> index | |
%146 = "arith.muli"(%145, %64) : (index, index) -> index | |
%147 = "arith.addi"(%139, %146) : (index, index) -> index | |
%148 = "arith.addi"(%147, %75) : (index, index) -> index | |
%149 = "memref.load"(%91, %148) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%150 = "arith.addi"(%147, %63) : (index, index) -> index | |
%151 = "memref.load"(%92, %150) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%152 = "arith.muli"(%82, %69) : (index, index) -> index | |
%153 = "arith.addi"(%81, %152) : (index, index) -> index | |
%154 = "arith.muli"(%83, %62) : (index, index) -> index | |
%155 = "arith.addi"(%153, %154) : (index, index) -> index | |
%156 = "arith.addi"(%155, %118) : (index, index) -> index | |
%157 = "arith.divsi"(%142, %71) : (index, index) -> index | |
%158 = "arith.subi"(%70, %157) : (index, index) -> index | |
%159 = "arith.select"(%140, %158, %157) : (i1, index, index) -> index | |
%160 = "arith.muli"(%159, %61) : (index, index) -> index | |
%161 = "arith.addi"(%156, %160) : (index, index) -> index | |
%162 = "arith.cmpi"(%88, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%163 = "arith.subi"(%70, %88) : (index, index) -> index | |
%164 = "arith.select"(%162, %163, %88) : (i1, index, index) -> index | |
%165 = "arith.divsi"(%164, %71) : (index, index) -> index | |
%166 = "arith.subi"(%70, %165) : (index, index) -> index | |
%167 = "arith.select"(%162, %166, %165) : (i1, index, index) -> index | |
%168 = "arith.addi"(%161, %167) : (index, index) -> index | |
%169 = "memref.load"(%96, %168) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%170 = "arith.addi"(%168, %62) : (index, index) -> index | |
%171 = "memref.load"(%97, %170) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%172:8 = "scf.for"(%74, %77, %80, %131, %132, %133, %134, %149, %151, %169, %171) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>, %arg2: vector<4xf32>, %arg3: vector<4xf32>, %arg4: vector<4xf32>, %arg5: vector<4xf32>, %arg6: vector<4xf32>, %arg7: vector<4xf32>, %arg8: vector<4xf32>): | |
"gpu.barrier"() : () -> () | |
%696 = "arith.muli"(%82, %60) : (index, index) -> index | |
%697 = "arith.addi"(%81, %696) : (index, index) -> index | |
%698 = "arith.muli"(%83, %77) : (index, index) -> index | |
%699 = "arith.addi"(%697, %698) : (index, index) -> index | |
%700 = "arith.addi"(%699, %145) : (index, index) -> index | |
"memref.store"(%arg5, %84, %700) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%701 = "arith.addi"(%700, %77) : (index, index) -> index | |
"memref.store"(%arg6, %84, %701) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%702 = "arith.muli"(%82, %59) : (index, index) -> index | |
%703 = "arith.addi"(%81, %702) : (index, index) -> index | |
%704 = "arith.muli"(%83, %58) : (index, index) -> index | |
%705 = "arith.addi"(%703, %704) : (index, index) -> index | |
%706 = "arith.addi"(%705, %159) : (index, index) -> index | |
"memref.store"(%arg7, %85, %706) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%707 = "arith.addi"(%706, %58) : (index, index) -> index | |
"memref.store"(%arg8, %85, %707) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
"gpu.barrier"() : () -> () | |
%708 = "arith.muli"(%82, %57) : (index, index) -> index | |
%709 = "memref.load"(%84, %708) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%710 = "arith.addi"(%708, %56) : (index, index) -> index | |
%711 = "memref.load"(%84, %710) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%712 = "arith.addi"(%708, %55) : (index, index) -> index | |
%713 = "memref.load"(%84, %712) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%714 = "arith.addi"(%708, %54) : (index, index) -> index | |
%715 = "memref.load"(%84, %714) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%716 = "arith.addi"(%708, %53) : (index, index) -> index | |
%717 = "memref.load"(%84, %716) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%718 = "arith.addi"(%708, %52) : (index, index) -> index | |
%719 = "memref.load"(%84, %718) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%720 = "arith.addi"(%708, %51) : (index, index) -> index | |
%721 = "memref.load"(%84, %720) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%722 = "arith.addi"(%708, %50) : (index, index) -> index | |
%723 = "memref.load"(%84, %722) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%724 = "arith.addi"(%708, %49) : (index, index) -> index | |
%725 = "memref.load"(%84, %724) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%726 = "arith.addi"(%708, %48) : (index, index) -> index | |
%727 = "memref.load"(%84, %726) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%728 = "arith.addi"(%708, %47) : (index, index) -> index | |
%729 = "memref.load"(%84, %728) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%730 = "arith.addi"(%708, %46) : (index, index) -> index | |
%731 = "memref.load"(%84, %730) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%732 = "arith.addi"(%708, %45) : (index, index) -> index | |
%733 = "memref.load"(%84, %732) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%734 = "arith.addi"(%708, %44) : (index, index) -> index | |
%735 = "memref.load"(%84, %734) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%736 = "arith.addi"(%708, %43) : (index, index) -> index | |
%737 = "memref.load"(%84, %736) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%738 = "arith.addi"(%708, %71) : (index, index) -> index | |
%739 = "memref.load"(%84, %738) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%740 = "arith.addi"(%708, %60) : (index, index) -> index | |
%741 = "memref.load"(%84, %740) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%742 = "arith.addi"(%708, %42) : (index, index) -> index | |
%743 = "memref.load"(%84, %742) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%744 = "arith.addi"(%708, %41) : (index, index) -> index | |
%745 = "memref.load"(%84, %744) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%746 = "arith.addi"(%708, %40) : (index, index) -> index | |
%747 = "memref.load"(%84, %746) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%748 = "arith.addi"(%708, %39) : (index, index) -> index | |
%749 = "memref.load"(%84, %748) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%750 = "arith.addi"(%708, %38) : (index, index) -> index | |
%751 = "memref.load"(%84, %750) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%752 = "arith.addi"(%708, %37) : (index, index) -> index | |
%753 = "memref.load"(%84, %752) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%754 = "arith.addi"(%708, %36) : (index, index) -> index | |
%755 = "memref.load"(%84, %754) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%756 = "arith.addi"(%708, %35) : (index, index) -> index | |
%757 = "memref.load"(%84, %756) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%758 = "arith.addi"(%708, %34) : (index, index) -> index | |
%759 = "memref.load"(%84, %758) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%760 = "arith.addi"(%708, %33) : (index, index) -> index | |
%761 = "memref.load"(%84, %760) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%762 = "arith.addi"(%708, %32) : (index, index) -> index | |
%763 = "memref.load"(%84, %762) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%764 = "arith.addi"(%708, %31) : (index, index) -> index | |
%765 = "memref.load"(%84, %764) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%766 = "arith.addi"(%708, %80) : (index, index) -> index | |
%767 = "memref.load"(%84, %766) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%768 = "arith.addi"(%708, %30) : (index, index) -> index | |
%769 = "memref.load"(%84, %768) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%770 = "arith.addi"(%708, %29) : (index, index) -> index | |
%771 = "memref.load"(%84, %770) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%772 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%773 = "arith.addi"(%81, %59) : (index, index) -> index | |
%774 = "memref.load"(%85, %773) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%775 = "arith.addi"(%81, %29) : (index, index) -> index | |
%776 = "memref.load"(%85, %775) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%777 = "arith.addi"(%81, %28) : (index, index) -> index | |
%778 = "memref.load"(%85, %777) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%779 = "arith.addi"(%81, %27) : (index, index) -> index | |
%780 = "memref.load"(%85, %779) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%781 = "arith.addi"(%81, %26) : (index, index) -> index | |
%782 = "memref.load"(%85, %781) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%783 = "arith.addi"(%81, %25) : (index, index) -> index | |
%784 = "memref.load"(%85, %783) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%785 = "arith.addi"(%81, %24) : (index, index) -> index | |
%786 = "memref.load"(%85, %785) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%787 = "arith.addi"(%81, %23) : (index, index) -> index | |
%788 = "memref.load"(%85, %787) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%789 = "arith.addi"(%81, %22) : (index, index) -> index | |
%790 = "memref.load"(%85, %789) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%791 = "arith.addi"(%81, %21) : (index, index) -> index | |
%792 = "memref.load"(%85, %791) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%793 = "arith.addi"(%81, %20) : (index, index) -> index | |
%794 = "memref.load"(%85, %793) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%795 = "arith.addi"(%81, %19) : (index, index) -> index | |
%796 = "memref.load"(%85, %795) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%797 = "arith.addi"(%81, %18) : (index, index) -> index | |
%798 = "memref.load"(%85, %797) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%799 = "arith.addi"(%81, %17) : (index, index) -> index | |
%800 = "memref.load"(%85, %799) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%801 = "arith.addi"(%81, %16) : (index, index) -> index | |
%802 = "memref.load"(%85, %801) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%803 = "arith.addi"(%81, %58) : (index, index) -> index | |
%804 = "memref.load"(%85, %803) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%805 = "arith.addi"(%81, %15) : (index, index) -> index | |
%806 = "memref.load"(%85, %805) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%807 = "arith.addi"(%81, %14) : (index, index) -> index | |
%808 = "memref.load"(%85, %807) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%809 = "arith.addi"(%81, %13) : (index, index) -> index | |
%810 = "memref.load"(%85, %809) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%811 = "arith.addi"(%81, %12) : (index, index) -> index | |
%812 = "memref.load"(%85, %811) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%813 = "arith.addi"(%81, %11) : (index, index) -> index | |
%814 = "memref.load"(%85, %813) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%815 = "arith.addi"(%81, %10) : (index, index) -> index | |
%816 = "memref.load"(%85, %815) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%817 = "arith.addi"(%81, %9) : (index, index) -> index | |
%818 = "memref.load"(%85, %817) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%819 = "arith.addi"(%81, %8) : (index, index) -> index | |
%820 = "memref.load"(%85, %819) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%821 = "arith.addi"(%81, %7) : (index, index) -> index | |
%822 = "memref.load"(%85, %821) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%823 = "arith.addi"(%81, %6) : (index, index) -> index | |
%824 = "memref.load"(%85, %823) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%825 = "arith.addi"(%81, %5) : (index, index) -> index | |
%826 = "memref.load"(%85, %825) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%827 = "arith.addi"(%81, %4) : (index, index) -> index | |
%828 = "memref.load"(%85, %827) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%829 = "arith.addi"(%81, %3) : (index, index) -> index | |
%830 = "memref.load"(%85, %829) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%831 = "arith.addi"(%81, %2) : (index, index) -> index | |
%832 = "memref.load"(%85, %831) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%833 = "arith.addi"(%81, %1) : (index, index) -> index | |
%834 = "memref.load"(%85, %833) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%835 = "vector.extract"(%709) {position = [0]} : (vector<4xf32>) -> f32 | |
%836 = "vector.splat"(%835) : (f32) -> vector<4xf32> | |
%837 = "vector.fma"(%836, %772, %arg1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%838 = "vector.extract"(%709) {position = [1]} : (vector<4xf32>) -> f32 | |
%839 = "vector.splat"(%838) : (f32) -> vector<4xf32> | |
%840 = "vector.fma"(%839, %774, %837) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%841 = "vector.extract"(%709) {position = [2]} : (vector<4xf32>) -> f32 | |
%842 = "vector.splat"(%841) : (f32) -> vector<4xf32> | |
%843 = "vector.fma"(%842, %776, %840) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%844 = "vector.extract"(%709) {position = [3]} : (vector<4xf32>) -> f32 | |
%845 = "vector.splat"(%844) : (f32) -> vector<4xf32> | |
%846 = "vector.fma"(%845, %778, %843) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%847 = "vector.extract"(%711) {position = [0]} : (vector<4xf32>) -> f32 | |
%848 = "vector.splat"(%847) : (f32) -> vector<4xf32> | |
%849 = "vector.fma"(%848, %780, %846) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%850 = "vector.extract"(%711) {position = [1]} : (vector<4xf32>) -> f32 | |
%851 = "vector.splat"(%850) : (f32) -> vector<4xf32> | |
%852 = "vector.fma"(%851, %782, %849) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%853 = "vector.extract"(%711) {position = [2]} : (vector<4xf32>) -> f32 | |
%854 = "vector.splat"(%853) : (f32) -> vector<4xf32> | |
%855 = "vector.fma"(%854, %784, %852) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%856 = "vector.extract"(%711) {position = [3]} : (vector<4xf32>) -> f32 | |
%857 = "vector.splat"(%856) : (f32) -> vector<4xf32> | |
%858 = "vector.fma"(%857, %786, %855) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%859 = "vector.extract"(%713) {position = [0]} : (vector<4xf32>) -> f32 | |
%860 = "vector.splat"(%859) : (f32) -> vector<4xf32> | |
%861 = "vector.fma"(%860, %788, %858) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%862 = "vector.extract"(%713) {position = [1]} : (vector<4xf32>) -> f32 | |
%863 = "vector.splat"(%862) : (f32) -> vector<4xf32> | |
%864 = "vector.fma"(%863, %790, %861) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%865 = "vector.extract"(%713) {position = [2]} : (vector<4xf32>) -> f32 | |
%866 = "vector.splat"(%865) : (f32) -> vector<4xf32> | |
%867 = "vector.fma"(%866, %792, %864) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%868 = "vector.extract"(%713) {position = [3]} : (vector<4xf32>) -> f32 | |
%869 = "vector.splat"(%868) : (f32) -> vector<4xf32> | |
%870 = "vector.fma"(%869, %794, %867) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%871 = "vector.extract"(%715) {position = [0]} : (vector<4xf32>) -> f32 | |
%872 = "vector.splat"(%871) : (f32) -> vector<4xf32> | |
%873 = "vector.fma"(%872, %796, %870) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%874 = "vector.extract"(%715) {position = [1]} : (vector<4xf32>) -> f32 | |
%875 = "vector.splat"(%874) : (f32) -> vector<4xf32> | |
%876 = "vector.fma"(%875, %798, %873) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%877 = "vector.extract"(%715) {position = [2]} : (vector<4xf32>) -> f32 | |
%878 = "vector.splat"(%877) : (f32) -> vector<4xf32> | |
%879 = "vector.fma"(%878, %800, %876) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%880 = "vector.extract"(%715) {position = [3]} : (vector<4xf32>) -> f32 | |
%881 = "vector.splat"(%880) : (f32) -> vector<4xf32> | |
%882 = "vector.fma"(%881, %802, %879) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%883 = "vector.extract"(%717) {position = [0]} : (vector<4xf32>) -> f32 | |
%884 = "vector.splat"(%883) : (f32) -> vector<4xf32> | |
%885 = "vector.fma"(%884, %804, %882) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%886 = "vector.extract"(%717) {position = [1]} : (vector<4xf32>) -> f32 | |
%887 = "vector.splat"(%886) : (f32) -> vector<4xf32> | |
%888 = "vector.fma"(%887, %806, %885) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%889 = "vector.extract"(%717) {position = [2]} : (vector<4xf32>) -> f32 | |
%890 = "vector.splat"(%889) : (f32) -> vector<4xf32> | |
%891 = "vector.fma"(%890, %808, %888) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%892 = "vector.extract"(%717) {position = [3]} : (vector<4xf32>) -> f32 | |
%893 = "vector.splat"(%892) : (f32) -> vector<4xf32> | |
%894 = "vector.fma"(%893, %810, %891) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%895 = "vector.extract"(%719) {position = [0]} : (vector<4xf32>) -> f32 | |
%896 = "vector.splat"(%895) : (f32) -> vector<4xf32> | |
%897 = "vector.fma"(%896, %812, %894) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%898 = "vector.extract"(%719) {position = [1]} : (vector<4xf32>) -> f32 | |
%899 = "vector.splat"(%898) : (f32) -> vector<4xf32> | |
%900 = "vector.fma"(%899, %814, %897) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%901 = "vector.extract"(%719) {position = [2]} : (vector<4xf32>) -> f32 | |
%902 = "vector.splat"(%901) : (f32) -> vector<4xf32> | |
%903 = "vector.fma"(%902, %816, %900) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%904 = "vector.extract"(%719) {position = [3]} : (vector<4xf32>) -> f32 | |
%905 = "vector.splat"(%904) : (f32) -> vector<4xf32> | |
%906 = "vector.fma"(%905, %818, %903) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%907 = "vector.extract"(%721) {position = [0]} : (vector<4xf32>) -> f32 | |
%908 = "vector.splat"(%907) : (f32) -> vector<4xf32> | |
%909 = "vector.fma"(%908, %820, %906) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%910 = "vector.extract"(%721) {position = [1]} : (vector<4xf32>) -> f32 | |
%911 = "vector.splat"(%910) : (f32) -> vector<4xf32> | |
%912 = "vector.fma"(%911, %822, %909) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%913 = "vector.extract"(%721) {position = [2]} : (vector<4xf32>) -> f32 | |
%914 = "vector.splat"(%913) : (f32) -> vector<4xf32> | |
%915 = "vector.fma"(%914, %824, %912) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%916 = "vector.extract"(%721) {position = [3]} : (vector<4xf32>) -> f32 | |
%917 = "vector.splat"(%916) : (f32) -> vector<4xf32> | |
%918 = "vector.fma"(%917, %826, %915) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%919 = "vector.extract"(%723) {position = [0]} : (vector<4xf32>) -> f32 | |
%920 = "vector.splat"(%919) : (f32) -> vector<4xf32> | |
%921 = "vector.fma"(%920, %828, %918) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%922 = "vector.extract"(%723) {position = [1]} : (vector<4xf32>) -> f32 | |
%923 = "vector.splat"(%922) : (f32) -> vector<4xf32> | |
%924 = "vector.fma"(%923, %830, %921) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%925 = "vector.extract"(%723) {position = [2]} : (vector<4xf32>) -> f32 | |
%926 = "vector.splat"(%925) : (f32) -> vector<4xf32> | |
%927 = "vector.fma"(%926, %832, %924) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%928 = "vector.extract"(%723) {position = [3]} : (vector<4xf32>) -> f32 | |
%929 = "vector.splat"(%928) : (f32) -> vector<4xf32> | |
%930 = "vector.fma"(%929, %834, %927) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%931 = "vector.extract"(%725) {position = [0]} : (vector<4xf32>) -> f32 | |
%932 = "vector.splat"(%931) : (f32) -> vector<4xf32> | |
%933 = "vector.fma"(%932, %772, %arg2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%934 = "vector.extract"(%725) {position = [1]} : (vector<4xf32>) -> f32 | |
%935 = "vector.splat"(%934) : (f32) -> vector<4xf32> | |
%936 = "vector.fma"(%935, %774, %933) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%937 = "vector.extract"(%725) {position = [2]} : (vector<4xf32>) -> f32 | |
%938 = "vector.splat"(%937) : (f32) -> vector<4xf32> | |
%939 = "vector.fma"(%938, %776, %936) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%940 = "vector.extract"(%725) {position = [3]} : (vector<4xf32>) -> f32 | |
%941 = "vector.splat"(%940) : (f32) -> vector<4xf32> | |
%942 = "vector.fma"(%941, %778, %939) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%943 = "vector.extract"(%727) {position = [0]} : (vector<4xf32>) -> f32 | |
%944 = "vector.splat"(%943) : (f32) -> vector<4xf32> | |
%945 = "vector.fma"(%944, %780, %942) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%946 = "vector.extract"(%727) {position = [1]} : (vector<4xf32>) -> f32 | |
%947 = "vector.splat"(%946) : (f32) -> vector<4xf32> | |
%948 = "vector.fma"(%947, %782, %945) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%949 = "vector.extract"(%727) {position = [2]} : (vector<4xf32>) -> f32 | |
%950 = "vector.splat"(%949) : (f32) -> vector<4xf32> | |
%951 = "vector.fma"(%950, %784, %948) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%952 = "vector.extract"(%727) {position = [3]} : (vector<4xf32>) -> f32 | |
%953 = "vector.splat"(%952) : (f32) -> vector<4xf32> | |
%954 = "vector.fma"(%953, %786, %951) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%955 = "vector.extract"(%729) {position = [0]} : (vector<4xf32>) -> f32 | |
%956 = "vector.splat"(%955) : (f32) -> vector<4xf32> | |
%957 = "vector.fma"(%956, %788, %954) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%958 = "vector.extract"(%729) {position = [1]} : (vector<4xf32>) -> f32 | |
%959 = "vector.splat"(%958) : (f32) -> vector<4xf32> | |
%960 = "vector.fma"(%959, %790, %957) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%961 = "vector.extract"(%729) {position = [2]} : (vector<4xf32>) -> f32 | |
%962 = "vector.splat"(%961) : (f32) -> vector<4xf32> | |
%963 = "vector.fma"(%962, %792, %960) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%964 = "vector.extract"(%729) {position = [3]} : (vector<4xf32>) -> f32 | |
%965 = "vector.splat"(%964) : (f32) -> vector<4xf32> | |
%966 = "vector.fma"(%965, %794, %963) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%967 = "vector.extract"(%731) {position = [0]} : (vector<4xf32>) -> f32 | |
%968 = "vector.splat"(%967) : (f32) -> vector<4xf32> | |
%969 = "vector.fma"(%968, %796, %966) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%970 = "vector.extract"(%731) {position = [1]} : (vector<4xf32>) -> f32 | |
%971 = "vector.splat"(%970) : (f32) -> vector<4xf32> | |
%972 = "vector.fma"(%971, %798, %969) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%973 = "vector.extract"(%731) {position = [2]} : (vector<4xf32>) -> f32 | |
%974 = "vector.splat"(%973) : (f32) -> vector<4xf32> | |
%975 = "vector.fma"(%974, %800, %972) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%976 = "vector.extract"(%731) {position = [3]} : (vector<4xf32>) -> f32 | |
%977 = "vector.splat"(%976) : (f32) -> vector<4xf32> | |
%978 = "vector.fma"(%977, %802, %975) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%979 = "vector.extract"(%733) {position = [0]} : (vector<4xf32>) -> f32 | |
%980 = "vector.splat"(%979) : (f32) -> vector<4xf32> | |
%981 = "vector.fma"(%980, %804, %978) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%982 = "vector.extract"(%733) {position = [1]} : (vector<4xf32>) -> f32 | |
%983 = "vector.splat"(%982) : (f32) -> vector<4xf32> | |
%984 = "vector.fma"(%983, %806, %981) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%985 = "vector.extract"(%733) {position = [2]} : (vector<4xf32>) -> f32 | |
%986 = "vector.splat"(%985) : (f32) -> vector<4xf32> | |
%987 = "vector.fma"(%986, %808, %984) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%988 = "vector.extract"(%733) {position = [3]} : (vector<4xf32>) -> f32 | |
%989 = "vector.splat"(%988) : (f32) -> vector<4xf32> | |
%990 = "vector.fma"(%989, %810, %987) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%991 = "vector.extract"(%735) {position = [0]} : (vector<4xf32>) -> f32 | |
%992 = "vector.splat"(%991) : (f32) -> vector<4xf32> | |
%993 = "vector.fma"(%992, %812, %990) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%994 = "vector.extract"(%735) {position = [1]} : (vector<4xf32>) -> f32 | |
%995 = "vector.splat"(%994) : (f32) -> vector<4xf32> | |
%996 = "vector.fma"(%995, %814, %993) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%997 = "vector.extract"(%735) {position = [2]} : (vector<4xf32>) -> f32 | |
%998 = "vector.splat"(%997) : (f32) -> vector<4xf32> | |
%999 = "vector.fma"(%998, %816, %996) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1000 = "vector.extract"(%735) {position = [3]} : (vector<4xf32>) -> f32 | |
%1001 = "vector.splat"(%1000) : (f32) -> vector<4xf32> | |
%1002 = "vector.fma"(%1001, %818, %999) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1003 = "vector.extract"(%737) {position = [0]} : (vector<4xf32>) -> f32 | |
%1004 = "vector.splat"(%1003) : (f32) -> vector<4xf32> | |
%1005 = "vector.fma"(%1004, %820, %1002) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1006 = "vector.extract"(%737) {position = [1]} : (vector<4xf32>) -> f32 | |
%1007 = "vector.splat"(%1006) : (f32) -> vector<4xf32> | |
%1008 = "vector.fma"(%1007, %822, %1005) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1009 = "vector.extract"(%737) {position = [2]} : (vector<4xf32>) -> f32 | |
%1010 = "vector.splat"(%1009) : (f32) -> vector<4xf32> | |
%1011 = "vector.fma"(%1010, %824, %1008) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1012 = "vector.extract"(%737) {position = [3]} : (vector<4xf32>) -> f32 | |
%1013 = "vector.splat"(%1012) : (f32) -> vector<4xf32> | |
%1014 = "vector.fma"(%1013, %826, %1011) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1015 = "vector.extract"(%739) {position = [0]} : (vector<4xf32>) -> f32 | |
%1016 = "vector.splat"(%1015) : (f32) -> vector<4xf32> | |
%1017 = "vector.fma"(%1016, %828, %1014) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1018 = "vector.extract"(%739) {position = [1]} : (vector<4xf32>) -> f32 | |
%1019 = "vector.splat"(%1018) : (f32) -> vector<4xf32> | |
%1020 = "vector.fma"(%1019, %830, %1017) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1021 = "vector.extract"(%739) {position = [2]} : (vector<4xf32>) -> f32 | |
%1022 = "vector.splat"(%1021) : (f32) -> vector<4xf32> | |
%1023 = "vector.fma"(%1022, %832, %1020) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1024 = "vector.extract"(%739) {position = [3]} : (vector<4xf32>) -> f32 | |
%1025 = "vector.splat"(%1024) : (f32) -> vector<4xf32> | |
%1026 = "vector.fma"(%1025, %834, %1023) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1027 = "vector.extract"(%741) {position = [0]} : (vector<4xf32>) -> f32 | |
%1028 = "vector.splat"(%1027) : (f32) -> vector<4xf32> | |
%1029 = "vector.fma"(%1028, %772, %arg3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1030 = "vector.extract"(%741) {position = [1]} : (vector<4xf32>) -> f32 | |
%1031 = "vector.splat"(%1030) : (f32) -> vector<4xf32> | |
%1032 = "vector.fma"(%1031, %774, %1029) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1033 = "vector.extract"(%741) {position = [2]} : (vector<4xf32>) -> f32 | |
%1034 = "vector.splat"(%1033) : (f32) -> vector<4xf32> | |
%1035 = "vector.fma"(%1034, %776, %1032) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1036 = "vector.extract"(%741) {position = [3]} : (vector<4xf32>) -> f32 | |
%1037 = "vector.splat"(%1036) : (f32) -> vector<4xf32> | |
%1038 = "vector.fma"(%1037, %778, %1035) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1039 = "vector.extract"(%743) {position = [0]} : (vector<4xf32>) -> f32 | |
%1040 = "vector.splat"(%1039) : (f32) -> vector<4xf32> | |
%1041 = "vector.fma"(%1040, %780, %1038) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1042 = "vector.extract"(%743) {position = [1]} : (vector<4xf32>) -> f32 | |
%1043 = "vector.splat"(%1042) : (f32) -> vector<4xf32> | |
%1044 = "vector.fma"(%1043, %782, %1041) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1045 = "vector.extract"(%743) {position = [2]} : (vector<4xf32>) -> f32 | |
%1046 = "vector.splat"(%1045) : (f32) -> vector<4xf32> | |
%1047 = "vector.fma"(%1046, %784, %1044) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1048 = "vector.extract"(%743) {position = [3]} : (vector<4xf32>) -> f32 | |
%1049 = "vector.splat"(%1048) : (f32) -> vector<4xf32> | |
%1050 = "vector.fma"(%1049, %786, %1047) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1051 = "vector.extract"(%745) {position = [0]} : (vector<4xf32>) -> f32 | |
%1052 = "vector.splat"(%1051) : (f32) -> vector<4xf32> | |
%1053 = "vector.fma"(%1052, %788, %1050) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1054 = "vector.extract"(%745) {position = [1]} : (vector<4xf32>) -> f32 | |
%1055 = "vector.splat"(%1054) : (f32) -> vector<4xf32> | |
%1056 = "vector.fma"(%1055, %790, %1053) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1057 = "vector.extract"(%745) {position = [2]} : (vector<4xf32>) -> f32 | |
%1058 = "vector.splat"(%1057) : (f32) -> vector<4xf32> | |
%1059 = "vector.fma"(%1058, %792, %1056) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1060 = "vector.extract"(%745) {position = [3]} : (vector<4xf32>) -> f32 | |
%1061 = "vector.splat"(%1060) : (f32) -> vector<4xf32> | |
%1062 = "vector.fma"(%1061, %794, %1059) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1063 = "vector.extract"(%747) {position = [0]} : (vector<4xf32>) -> f32 | |
%1064 = "vector.splat"(%1063) : (f32) -> vector<4xf32> | |
%1065 = "vector.fma"(%1064, %796, %1062) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1066 = "vector.extract"(%747) {position = [1]} : (vector<4xf32>) -> f32 | |
%1067 = "vector.splat"(%1066) : (f32) -> vector<4xf32> | |
%1068 = "vector.fma"(%1067, %798, %1065) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1069 = "vector.extract"(%747) {position = [2]} : (vector<4xf32>) -> f32 | |
%1070 = "vector.splat"(%1069) : (f32) -> vector<4xf32> | |
%1071 = "vector.fma"(%1070, %800, %1068) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1072 = "vector.extract"(%747) {position = [3]} : (vector<4xf32>) -> f32 | |
%1073 = "vector.splat"(%1072) : (f32) -> vector<4xf32> | |
%1074 = "vector.fma"(%1073, %802, %1071) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1075 = "vector.extract"(%749) {position = [0]} : (vector<4xf32>) -> f32 | |
%1076 = "vector.splat"(%1075) : (f32) -> vector<4xf32> | |
%1077 = "vector.fma"(%1076, %804, %1074) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1078 = "vector.extract"(%749) {position = [1]} : (vector<4xf32>) -> f32 | |
%1079 = "vector.splat"(%1078) : (f32) -> vector<4xf32> | |
%1080 = "vector.fma"(%1079, %806, %1077) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1081 = "vector.extract"(%749) {position = [2]} : (vector<4xf32>) -> f32 | |
%1082 = "vector.splat"(%1081) : (f32) -> vector<4xf32> | |
%1083 = "vector.fma"(%1082, %808, %1080) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1084 = "vector.extract"(%749) {position = [3]} : (vector<4xf32>) -> f32 | |
%1085 = "vector.splat"(%1084) : (f32) -> vector<4xf32> | |
%1086 = "vector.fma"(%1085, %810, %1083) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1087 = "vector.extract"(%751) {position = [0]} : (vector<4xf32>) -> f32 | |
%1088 = "vector.splat"(%1087) : (f32) -> vector<4xf32> | |
%1089 = "vector.fma"(%1088, %812, %1086) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1090 = "vector.extract"(%751) {position = [1]} : (vector<4xf32>) -> f32 | |
%1091 = "vector.splat"(%1090) : (f32) -> vector<4xf32> | |
%1092 = "vector.fma"(%1091, %814, %1089) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1093 = "vector.extract"(%751) {position = [2]} : (vector<4xf32>) -> f32 | |
%1094 = "vector.splat"(%1093) : (f32) -> vector<4xf32> | |
%1095 = "vector.fma"(%1094, %816, %1092) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1096 = "vector.extract"(%751) {position = [3]} : (vector<4xf32>) -> f32 | |
%1097 = "vector.splat"(%1096) : (f32) -> vector<4xf32> | |
%1098 = "vector.fma"(%1097, %818, %1095) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1099 = "vector.extract"(%753) {position = [0]} : (vector<4xf32>) -> f32 | |
%1100 = "vector.splat"(%1099) : (f32) -> vector<4xf32> | |
%1101 = "vector.fma"(%1100, %820, %1098) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1102 = "vector.extract"(%753) {position = [1]} : (vector<4xf32>) -> f32 | |
%1103 = "vector.splat"(%1102) : (f32) -> vector<4xf32> | |
%1104 = "vector.fma"(%1103, %822, %1101) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1105 = "vector.extract"(%753) {position = [2]} : (vector<4xf32>) -> f32 | |
%1106 = "vector.splat"(%1105) : (f32) -> vector<4xf32> | |
%1107 = "vector.fma"(%1106, %824, %1104) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1108 = "vector.extract"(%753) {position = [3]} : (vector<4xf32>) -> f32 | |
%1109 = "vector.splat"(%1108) : (f32) -> vector<4xf32> | |
%1110 = "vector.fma"(%1109, %826, %1107) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1111 = "vector.extract"(%755) {position = [0]} : (vector<4xf32>) -> f32 | |
%1112 = "vector.splat"(%1111) : (f32) -> vector<4xf32> | |
%1113 = "vector.fma"(%1112, %828, %1110) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1114 = "vector.extract"(%755) {position = [1]} : (vector<4xf32>) -> f32 | |
%1115 = "vector.splat"(%1114) : (f32) -> vector<4xf32> | |
%1116 = "vector.fma"(%1115, %830, %1113) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1117 = "vector.extract"(%755) {position = [2]} : (vector<4xf32>) -> f32 | |
%1118 = "vector.splat"(%1117) : (f32) -> vector<4xf32> | |
%1119 = "vector.fma"(%1118, %832, %1116) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1120 = "vector.extract"(%755) {position = [3]} : (vector<4xf32>) -> f32 | |
%1121 = "vector.splat"(%1120) : (f32) -> vector<4xf32> | |
%1122 = "vector.fma"(%1121, %834, %1119) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1123 = "vector.extract"(%757) {position = [0]} : (vector<4xf32>) -> f32 | |
%1124 = "vector.splat"(%1123) : (f32) -> vector<4xf32> | |
%1125 = "vector.fma"(%1124, %772, %arg4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1126 = "vector.extract"(%757) {position = [1]} : (vector<4xf32>) -> f32 | |
%1127 = "vector.splat"(%1126) : (f32) -> vector<4xf32> | |
%1128 = "vector.fma"(%1127, %774, %1125) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1129 = "vector.extract"(%757) {position = [2]} : (vector<4xf32>) -> f32 | |
%1130 = "vector.splat"(%1129) : (f32) -> vector<4xf32> | |
%1131 = "vector.fma"(%1130, %776, %1128) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1132 = "vector.extract"(%757) {position = [3]} : (vector<4xf32>) -> f32 | |
%1133 = "vector.splat"(%1132) : (f32) -> vector<4xf32> | |
%1134 = "vector.fma"(%1133, %778, %1131) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1135 = "vector.extract"(%759) {position = [0]} : (vector<4xf32>) -> f32 | |
%1136 = "vector.splat"(%1135) : (f32) -> vector<4xf32> | |
%1137 = "vector.fma"(%1136, %780, %1134) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1138 = "vector.extract"(%759) {position = [1]} : (vector<4xf32>) -> f32 | |
%1139 = "vector.splat"(%1138) : (f32) -> vector<4xf32> | |
%1140 = "vector.fma"(%1139, %782, %1137) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1141 = "vector.extract"(%759) {position = [2]} : (vector<4xf32>) -> f32 | |
%1142 = "vector.splat"(%1141) : (f32) -> vector<4xf32> | |
%1143 = "vector.fma"(%1142, %784, %1140) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1144 = "vector.extract"(%759) {position = [3]} : (vector<4xf32>) -> f32 | |
%1145 = "vector.splat"(%1144) : (f32) -> vector<4xf32> | |
%1146 = "vector.fma"(%1145, %786, %1143) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1147 = "vector.extract"(%761) {position = [0]} : (vector<4xf32>) -> f32 | |
%1148 = "vector.splat"(%1147) : (f32) -> vector<4xf32> | |
%1149 = "vector.fma"(%1148, %788, %1146) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1150 = "vector.extract"(%761) {position = [1]} : (vector<4xf32>) -> f32 | |
%1151 = "vector.splat"(%1150) : (f32) -> vector<4xf32> | |
%1152 = "vector.fma"(%1151, %790, %1149) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1153 = "vector.extract"(%761) {position = [2]} : (vector<4xf32>) -> f32 | |
%1154 = "vector.splat"(%1153) : (f32) -> vector<4xf32> | |
%1155 = "vector.fma"(%1154, %792, %1152) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1156 = "vector.extract"(%761) {position = [3]} : (vector<4xf32>) -> f32 | |
%1157 = "vector.splat"(%1156) : (f32) -> vector<4xf32> | |
%1158 = "vector.fma"(%1157, %794, %1155) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1159 = "vector.extract"(%763) {position = [0]} : (vector<4xf32>) -> f32 | |
%1160 = "vector.splat"(%1159) : (f32) -> vector<4xf32> | |
%1161 = "vector.fma"(%1160, %796, %1158) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1162 = "vector.extract"(%763) {position = [1]} : (vector<4xf32>) -> f32 | |
%1163 = "vector.splat"(%1162) : (f32) -> vector<4xf32> | |
%1164 = "vector.fma"(%1163, %798, %1161) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1165 = "vector.extract"(%763) {position = [2]} : (vector<4xf32>) -> f32 | |
%1166 = "vector.splat"(%1165) : (f32) -> vector<4xf32> | |
%1167 = "vector.fma"(%1166, %800, %1164) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1168 = "vector.extract"(%763) {position = [3]} : (vector<4xf32>) -> f32 | |
%1169 = "vector.splat"(%1168) : (f32) -> vector<4xf32> | |
%1170 = "vector.fma"(%1169, %802, %1167) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1171 = "vector.extract"(%765) {position = [0]} : (vector<4xf32>) -> f32 | |
%1172 = "vector.splat"(%1171) : (f32) -> vector<4xf32> | |
%1173 = "vector.fma"(%1172, %804, %1170) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1174 = "vector.extract"(%765) {position = [1]} : (vector<4xf32>) -> f32 | |
%1175 = "vector.splat"(%1174) : (f32) -> vector<4xf32> | |
%1176 = "vector.fma"(%1175, %806, %1173) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1177 = "vector.extract"(%765) {position = [2]} : (vector<4xf32>) -> f32 | |
%1178 = "vector.splat"(%1177) : (f32) -> vector<4xf32> | |
%1179 = "vector.fma"(%1178, %808, %1176) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1180 = "vector.extract"(%765) {position = [3]} : (vector<4xf32>) -> f32 | |
%1181 = "vector.splat"(%1180) : (f32) -> vector<4xf32> | |
%1182 = "vector.fma"(%1181, %810, %1179) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1183 = "vector.extract"(%767) {position = [0]} : (vector<4xf32>) -> f32 | |
%1184 = "vector.splat"(%1183) : (f32) -> vector<4xf32> | |
%1185 = "vector.fma"(%1184, %812, %1182) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1186 = "vector.extract"(%767) {position = [1]} : (vector<4xf32>) -> f32 | |
%1187 = "vector.splat"(%1186) : (f32) -> vector<4xf32> | |
%1188 = "vector.fma"(%1187, %814, %1185) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1189 = "vector.extract"(%767) {position = [2]} : (vector<4xf32>) -> f32 | |
%1190 = "vector.splat"(%1189) : (f32) -> vector<4xf32> | |
%1191 = "vector.fma"(%1190, %816, %1188) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1192 = "vector.extract"(%767) {position = [3]} : (vector<4xf32>) -> f32 | |
%1193 = "vector.splat"(%1192) : (f32) -> vector<4xf32> | |
%1194 = "vector.fma"(%1193, %818, %1191) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1195 = "vector.extract"(%769) {position = [0]} : (vector<4xf32>) -> f32 | |
%1196 = "vector.splat"(%1195) : (f32) -> vector<4xf32> | |
%1197 = "vector.fma"(%1196, %820, %1194) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1198 = "vector.extract"(%769) {position = [1]} : (vector<4xf32>) -> f32 | |
%1199 = "vector.splat"(%1198) : (f32) -> vector<4xf32> | |
%1200 = "vector.fma"(%1199, %822, %1197) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1201 = "vector.extract"(%769) {position = [2]} : (vector<4xf32>) -> f32 | |
%1202 = "vector.splat"(%1201) : (f32) -> vector<4xf32> | |
%1203 = "vector.fma"(%1202, %824, %1200) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1204 = "vector.extract"(%769) {position = [3]} : (vector<4xf32>) -> f32 | |
%1205 = "vector.splat"(%1204) : (f32) -> vector<4xf32> | |
%1206 = "vector.fma"(%1205, %826, %1203) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1207 = "vector.extract"(%771) {position = [0]} : (vector<4xf32>) -> f32 | |
%1208 = "vector.splat"(%1207) : (f32) -> vector<4xf32> | |
%1209 = "vector.fma"(%1208, %828, %1206) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1210 = "vector.extract"(%771) {position = [1]} : (vector<4xf32>) -> f32 | |
%1211 = "vector.splat"(%1210) : (f32) -> vector<4xf32> | |
%1212 = "vector.fma"(%1211, %830, %1209) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1213 = "vector.extract"(%771) {position = [2]} : (vector<4xf32>) -> f32 | |
%1214 = "vector.splat"(%1213) : (f32) -> vector<4xf32> | |
%1215 = "vector.fma"(%1214, %832, %1212) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1216 = "vector.extract"(%771) {position = [3]} : (vector<4xf32>) -> f32 | |
%1217 = "vector.splat"(%1216) : (f32) -> vector<4xf32> | |
%1218 = "vector.fma"(%1217, %834, %1215) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1219 = "arith.addi"(%arg0, %80) : (index, index) -> index | |
%1220 = "arith.cmpi"(%1219, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%1221 = "arith.subi"(%0, %arg0) : (index, index) -> index | |
%1222 = "arith.select"(%1220, %1221, %1219) : (i1, index, index) -> index | |
%1223 = "arith.divsi"(%1222, %53) : (index, index) -> index | |
%1224 = "arith.subi"(%70, %1223) : (index, index) -> index | |
%1225 = "arith.select"(%1220, %1224, %1223) : (i1, index, index) -> index | |
%1226 = "arith.addi"(%139, %1225) : (index, index) -> index | |
%1227 = "arith.addi"(%1226, %146) : (index, index) -> index | |
%1228 = "arith.addi"(%1227, %75) : (index, index) -> index | |
%1229 = "memref.load"(%93, %1228) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1230 = "arith.addi"(%1227, %63) : (index, index) -> index | |
%1231 = "memref.load"(%94, %1230) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1232 = "arith.muli"(%1219, %69) : (index, index) -> index | |
%1233 = "arith.addi"(%1232, %81) : (index, index) -> index | |
%1234 = "arith.addi"(%1233, %152) : (index, index) -> index | |
%1235 = "arith.addi"(%1234, %154) : (index, index) -> index | |
%1236 = "arith.addi"(%1235, %118) : (index, index) -> index | |
%1237 = "arith.addi"(%1236, %160) : (index, index) -> index | |
%1238 = "arith.addi"(%1237, %167) : (index, index) -> index | |
%1239 = "memref.load"(%98, %1238) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1240 = "arith.addi"(%1238, %62) : (index, index) -> index | |
%1241 = "memref.load"(%99, %1240) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
"scf.yield"(%930, %1026, %1122, %1218, %1229, %1231, %1239, %1241) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) | |
"gpu.barrier"() : () -> () | |
%173 = "arith.muli"(%82, %60) : (index, index) -> index | |
%174 = "arith.addi"(%81, %173) : (index, index) -> index | |
%175 = "arith.muli"(%83, %77) : (index, index) -> index | |
%176 = "arith.addi"(%174, %175) : (index, index) -> index | |
%177 = "arith.addi"(%176, %145) : (index, index) -> index | |
"memref.store"(%172#4, %84, %177) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%178 = "arith.addi"(%177, %77) : (index, index) -> index | |
"memref.store"(%172#5, %84, %178) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%179 = "arith.muli"(%82, %59) : (index, index) -> index | |
%180 = "arith.addi"(%81, %179) : (index, index) -> index | |
%181 = "arith.muli"(%83, %58) : (index, index) -> index | |
%182 = "arith.addi"(%180, %181) : (index, index) -> index | |
%183 = "arith.addi"(%182, %159) : (index, index) -> index | |
"memref.store"(%172#6, %85, %183) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%184 = "arith.addi"(%183, %58) : (index, index) -> index | |
"memref.store"(%172#7, %85, %184) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
"gpu.barrier"() : () -> () | |
%185 = "arith.muli"(%82, %57) : (index, index) -> index | |
%186 = "memref.load"(%84, %185) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%187 = "arith.addi"(%185, %56) : (index, index) -> index | |
%188 = "memref.load"(%84, %187) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%189 = "arith.addi"(%185, %55) : (index, index) -> index | |
%190 = "memref.load"(%84, %189) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%191 = "arith.addi"(%185, %54) : (index, index) -> index | |
%192 = "memref.load"(%84, %191) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%193 = "arith.addi"(%185, %53) : (index, index) -> index | |
%194 = "memref.load"(%84, %193) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%195 = "arith.addi"(%185, %52) : (index, index) -> index | |
%196 = "memref.load"(%84, %195) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%197 = "arith.addi"(%185, %51) : (index, index) -> index | |
%198 = "memref.load"(%84, %197) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%199 = "arith.addi"(%185, %50) : (index, index) -> index | |
%200 = "memref.load"(%84, %199) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%201 = "arith.addi"(%185, %49) : (index, index) -> index | |
%202 = "memref.load"(%84, %201) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%203 = "arith.addi"(%185, %48) : (index, index) -> index | |
%204 = "memref.load"(%84, %203) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%205 = "arith.addi"(%185, %47) : (index, index) -> index | |
%206 = "memref.load"(%84, %205) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%207 = "arith.addi"(%185, %46) : (index, index) -> index | |
%208 = "memref.load"(%84, %207) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%209 = "arith.addi"(%185, %45) : (index, index) -> index | |
%210 = "memref.load"(%84, %209) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%211 = "arith.addi"(%185, %44) : (index, index) -> index | |
%212 = "memref.load"(%84, %211) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%213 = "arith.addi"(%185, %43) : (index, index) -> index | |
%214 = "memref.load"(%84, %213) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%215 = "arith.addi"(%185, %71) : (index, index) -> index | |
%216 = "memref.load"(%84, %215) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%217 = "arith.addi"(%185, %60) : (index, index) -> index | |
%218 = "memref.load"(%84, %217) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%219 = "arith.addi"(%185, %42) : (index, index) -> index | |
%220 = "memref.load"(%84, %219) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%221 = "arith.addi"(%185, %41) : (index, index) -> index | |
%222 = "memref.load"(%84, %221) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%223 = "arith.addi"(%185, %40) : (index, index) -> index | |
%224 = "memref.load"(%84, %223) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%225 = "arith.addi"(%185, %39) : (index, index) -> index | |
%226 = "memref.load"(%84, %225) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%227 = "arith.addi"(%185, %38) : (index, index) -> index | |
%228 = "memref.load"(%84, %227) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%229 = "arith.addi"(%185, %37) : (index, index) -> index | |
%230 = "memref.load"(%84, %229) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%231 = "arith.addi"(%185, %36) : (index, index) -> index | |
%232 = "memref.load"(%84, %231) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%233 = "arith.addi"(%185, %35) : (index, index) -> index | |
%234 = "memref.load"(%84, %233) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%235 = "arith.addi"(%185, %34) : (index, index) -> index | |
%236 = "memref.load"(%84, %235) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%237 = "arith.addi"(%185, %33) : (index, index) -> index | |
%238 = "memref.load"(%84, %237) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%239 = "arith.addi"(%185, %32) : (index, index) -> index | |
%240 = "memref.load"(%84, %239) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%241 = "arith.addi"(%185, %31) : (index, index) -> index | |
%242 = "memref.load"(%84, %241) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%243 = "arith.addi"(%185, %80) : (index, index) -> index | |
%244 = "memref.load"(%84, %243) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%245 = "arith.addi"(%185, %30) : (index, index) -> index | |
%246 = "memref.load"(%84, %245) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%247 = "arith.addi"(%185, %29) : (index, index) -> index | |
%248 = "memref.load"(%84, %247) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%249 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%250 = "arith.addi"(%81, %59) : (index, index) -> index | |
%251 = "memref.load"(%85, %250) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%252 = "arith.addi"(%81, %29) : (index, index) -> index | |
%253 = "memref.load"(%85, %252) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%254 = "arith.addi"(%81, %28) : (index, index) -> index | |
%255 = "memref.load"(%85, %254) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%256 = "arith.addi"(%81, %27) : (index, index) -> index | |
%257 = "memref.load"(%85, %256) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%258 = "arith.addi"(%81, %26) : (index, index) -> index | |
%259 = "memref.load"(%85, %258) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%260 = "arith.addi"(%81, %25) : (index, index) -> index | |
%261 = "memref.load"(%85, %260) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%262 = "arith.addi"(%81, %24) : (index, index) -> index | |
%263 = "memref.load"(%85, %262) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%264 = "arith.addi"(%81, %23) : (index, index) -> index | |
%265 = "memref.load"(%85, %264) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%266 = "arith.addi"(%81, %22) : (index, index) -> index | |
%267 = "memref.load"(%85, %266) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%268 = "arith.addi"(%81, %21) : (index, index) -> index | |
%269 = "memref.load"(%85, %268) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%270 = "arith.addi"(%81, %20) : (index, index) -> index | |
%271 = "memref.load"(%85, %270) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%272 = "arith.addi"(%81, %19) : (index, index) -> index | |
%273 = "memref.load"(%85, %272) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%274 = "arith.addi"(%81, %18) : (index, index) -> index | |
%275 = "memref.load"(%85, %274) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%276 = "arith.addi"(%81, %17) : (index, index) -> index | |
%277 = "memref.load"(%85, %276) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%278 = "arith.addi"(%81, %16) : (index, index) -> index | |
%279 = "memref.load"(%85, %278) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%280 = "arith.addi"(%81, %58) : (index, index) -> index | |
%281 = "memref.load"(%85, %280) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%282 = "arith.addi"(%81, %15) : (index, index) -> index | |
%283 = "memref.load"(%85, %282) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%284 = "arith.addi"(%81, %14) : (index, index) -> index | |
%285 = "memref.load"(%85, %284) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%286 = "arith.addi"(%81, %13) : (index, index) -> index | |
%287 = "memref.load"(%85, %286) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%288 = "arith.addi"(%81, %12) : (index, index) -> index | |
%289 = "memref.load"(%85, %288) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%290 = "arith.addi"(%81, %11) : (index, index) -> index | |
%291 = "memref.load"(%85, %290) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%292 = "arith.addi"(%81, %10) : (index, index) -> index | |
%293 = "memref.load"(%85, %292) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%294 = "arith.addi"(%81, %9) : (index, index) -> index | |
%295 = "memref.load"(%85, %294) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%296 = "arith.addi"(%81, %8) : (index, index) -> index | |
%297 = "memref.load"(%85, %296) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%298 = "arith.addi"(%81, %7) : (index, index) -> index | |
%299 = "memref.load"(%85, %298) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%300 = "arith.addi"(%81, %6) : (index, index) -> index | |
%301 = "memref.load"(%85, %300) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%302 = "arith.addi"(%81, %5) : (index, index) -> index | |
%303 = "memref.load"(%85, %302) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%304 = "arith.addi"(%81, %4) : (index, index) -> index | |
%305 = "memref.load"(%85, %304) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%306 = "arith.addi"(%81, %3) : (index, index) -> index | |
%307 = "memref.load"(%85, %306) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%308 = "arith.addi"(%81, %2) : (index, index) -> index | |
%309 = "memref.load"(%85, %308) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%310 = "arith.addi"(%81, %1) : (index, index) -> index | |
%311 = "memref.load"(%85, %310) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%312 = "vector.extract"(%186) {position = [0]} : (vector<4xf32>) -> f32 | |
%313 = "vector.splat"(%312) : (f32) -> vector<4xf32> | |
%314 = "vector.fma"(%313, %249, %172#0) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%315 = "vector.extract"(%186) {position = [1]} : (vector<4xf32>) -> f32 | |
%316 = "vector.splat"(%315) : (f32) -> vector<4xf32> | |
%317 = "vector.fma"(%316, %251, %314) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%318 = "vector.extract"(%186) {position = [2]} : (vector<4xf32>) -> f32 | |
%319 = "vector.splat"(%318) : (f32) -> vector<4xf32> | |
%320 = "vector.fma"(%319, %253, %317) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%321 = "vector.extract"(%186) {position = [3]} : (vector<4xf32>) -> f32 | |
%322 = "vector.splat"(%321) : (f32) -> vector<4xf32> | |
%323 = "vector.fma"(%322, %255, %320) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%324 = "vector.extract"(%188) {position = [0]} : (vector<4xf32>) -> f32 | |
%325 = "vector.splat"(%324) : (f32) -> vector<4xf32> | |
%326 = "vector.fma"(%325, %257, %323) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%327 = "vector.extract"(%188) {position = [1]} : (vector<4xf32>) -> f32 | |
%328 = "vector.splat"(%327) : (f32) -> vector<4xf32> | |
%329 = "vector.fma"(%328, %259, %326) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%330 = "vector.extract"(%188) {position = [2]} : (vector<4xf32>) -> f32 | |
%331 = "vector.splat"(%330) : (f32) -> vector<4xf32> | |
%332 = "vector.fma"(%331, %261, %329) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%333 = "vector.extract"(%188) {position = [3]} : (vector<4xf32>) -> f32 | |
%334 = "vector.splat"(%333) : (f32) -> vector<4xf32> | |
%335 = "vector.fma"(%334, %263, %332) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%336 = "vector.extract"(%190) {position = [0]} : (vector<4xf32>) -> f32 | |
%337 = "vector.splat"(%336) : (f32) -> vector<4xf32> | |
%338 = "vector.fma"(%337, %265, %335) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%339 = "vector.extract"(%190) {position = [1]} : (vector<4xf32>) -> f32 | |
%340 = "vector.splat"(%339) : (f32) -> vector<4xf32> | |
%341 = "vector.fma"(%340, %267, %338) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%342 = "vector.extract"(%190) {position = [2]} : (vector<4xf32>) -> f32 | |
%343 = "vector.splat"(%342) : (f32) -> vector<4xf32> | |
%344 = "vector.fma"(%343, %269, %341) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%345 = "vector.extract"(%190) {position = [3]} : (vector<4xf32>) -> f32 | |
%346 = "vector.splat"(%345) : (f32) -> vector<4xf32> | |
%347 = "vector.fma"(%346, %271, %344) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%348 = "vector.extract"(%192) {position = [0]} : (vector<4xf32>) -> f32 | |
%349 = "vector.splat"(%348) : (f32) -> vector<4xf32> | |
%350 = "vector.fma"(%349, %273, %347) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%351 = "vector.extract"(%192) {position = [1]} : (vector<4xf32>) -> f32 | |
%352 = "vector.splat"(%351) : (f32) -> vector<4xf32> | |
%353 = "vector.fma"(%352, %275, %350) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%354 = "vector.extract"(%192) {position = [2]} : (vector<4xf32>) -> f32 | |
%355 = "vector.splat"(%354) : (f32) -> vector<4xf32> | |
%356 = "vector.fma"(%355, %277, %353) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%357 = "vector.extract"(%192) {position = [3]} : (vector<4xf32>) -> f32 | |
%358 = "vector.splat"(%357) : (f32) -> vector<4xf32> | |
%359 = "vector.fma"(%358, %279, %356) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%360 = "vector.extract"(%194) {position = [0]} : (vector<4xf32>) -> f32 | |
%361 = "vector.splat"(%360) : (f32) -> vector<4xf32> | |
%362 = "vector.fma"(%361, %281, %359) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%363 = "vector.extract"(%194) {position = [1]} : (vector<4xf32>) -> f32 | |
%364 = "vector.splat"(%363) : (f32) -> vector<4xf32> | |
%365 = "vector.fma"(%364, %283, %362) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%366 = "vector.extract"(%194) {position = [2]} : (vector<4xf32>) -> f32 | |
%367 = "vector.splat"(%366) : (f32) -> vector<4xf32> | |
%368 = "vector.fma"(%367, %285, %365) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%369 = "vector.extract"(%194) {position = [3]} : (vector<4xf32>) -> f32 | |
%370 = "vector.splat"(%369) : (f32) -> vector<4xf32> | |
%371 = "vector.fma"(%370, %287, %368) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%372 = "vector.extract"(%196) {position = [0]} : (vector<4xf32>) -> f32 | |
%373 = "vector.splat"(%372) : (f32) -> vector<4xf32> | |
%374 = "vector.fma"(%373, %289, %371) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%375 = "vector.extract"(%196) {position = [1]} : (vector<4xf32>) -> f32 | |
%376 = "vector.splat"(%375) : (f32) -> vector<4xf32> | |
%377 = "vector.fma"(%376, %291, %374) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%378 = "vector.extract"(%196) {position = [2]} : (vector<4xf32>) -> f32 | |
%379 = "vector.splat"(%378) : (f32) -> vector<4xf32> | |
%380 = "vector.fma"(%379, %293, %377) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%381 = "vector.extract"(%196) {position = [3]} : (vector<4xf32>) -> f32 | |
%382 = "vector.splat"(%381) : (f32) -> vector<4xf32> | |
%383 = "vector.fma"(%382, %295, %380) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%384 = "vector.extract"(%198) {position = [0]} : (vector<4xf32>) -> f32 | |
%385 = "vector.splat"(%384) : (f32) -> vector<4xf32> | |
%386 = "vector.fma"(%385, %297, %383) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%387 = "vector.extract"(%198) {position = [1]} : (vector<4xf32>) -> f32 | |
%388 = "vector.splat"(%387) : (f32) -> vector<4xf32> | |
%389 = "vector.fma"(%388, %299, %386) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%390 = "vector.extract"(%198) {position = [2]} : (vector<4xf32>) -> f32 | |
%391 = "vector.splat"(%390) : (f32) -> vector<4xf32> | |
%392 = "vector.fma"(%391, %301, %389) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%393 = "vector.extract"(%198) {position = [3]} : (vector<4xf32>) -> f32 | |
%394 = "vector.splat"(%393) : (f32) -> vector<4xf32> | |
%395 = "vector.fma"(%394, %303, %392) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%396 = "vector.extract"(%200) {position = [0]} : (vector<4xf32>) -> f32 | |
%397 = "vector.splat"(%396) : (f32) -> vector<4xf32> | |
%398 = "vector.fma"(%397, %305, %395) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%399 = "vector.extract"(%200) {position = [1]} : (vector<4xf32>) -> f32 | |
%400 = "vector.splat"(%399) : (f32) -> vector<4xf32> | |
%401 = "vector.fma"(%400, %307, %398) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%402 = "vector.extract"(%200) {position = [2]} : (vector<4xf32>) -> f32 | |
%403 = "vector.splat"(%402) : (f32) -> vector<4xf32> | |
%404 = "vector.fma"(%403, %309, %401) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%405 = "vector.extract"(%200) {position = [3]} : (vector<4xf32>) -> f32 | |
%406 = "vector.splat"(%405) : (f32) -> vector<4xf32> | |
%407 = "vector.fma"(%406, %311, %404) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%408 = "vector.extract"(%202) {position = [0]} : (vector<4xf32>) -> f32 | |
%409 = "vector.splat"(%408) : (f32) -> vector<4xf32> | |
%410 = "vector.fma"(%409, %249, %172#1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%411 = "vector.extract"(%202) {position = [1]} : (vector<4xf32>) -> f32 | |
%412 = "vector.splat"(%411) : (f32) -> vector<4xf32> | |
%413 = "vector.fma"(%412, %251, %410) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%414 = "vector.extract"(%202) {position = [2]} : (vector<4xf32>) -> f32 | |
%415 = "vector.splat"(%414) : (f32) -> vector<4xf32> | |
%416 = "vector.fma"(%415, %253, %413) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%417 = "vector.extract"(%202) {position = [3]} : (vector<4xf32>) -> f32 | |
%418 = "vector.splat"(%417) : (f32) -> vector<4xf32> | |
%419 = "vector.fma"(%418, %255, %416) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%420 = "vector.extract"(%204) {position = [0]} : (vector<4xf32>) -> f32 | |
%421 = "vector.splat"(%420) : (f32) -> vector<4xf32> | |
%422 = "vector.fma"(%421, %257, %419) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%423 = "vector.extract"(%204) {position = [1]} : (vector<4xf32>) -> f32 | |
%424 = "vector.splat"(%423) : (f32) -> vector<4xf32> | |
%425 = "vector.fma"(%424, %259, %422) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%426 = "vector.extract"(%204) {position = [2]} : (vector<4xf32>) -> f32 | |
%427 = "vector.splat"(%426) : (f32) -> vector<4xf32> | |
%428 = "vector.fma"(%427, %261, %425) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%429 = "vector.extract"(%204) {position = [3]} : (vector<4xf32>) -> f32 | |
%430 = "vector.splat"(%429) : (f32) -> vector<4xf32> | |
%431 = "vector.fma"(%430, %263, %428) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%432 = "vector.extract"(%206) {position = [0]} : (vector<4xf32>) -> f32 | |
%433 = "vector.splat"(%432) : (f32) -> vector<4xf32> | |
%434 = "vector.fma"(%433, %265, %431) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%435 = "vector.extract"(%206) {position = [1]} : (vector<4xf32>) -> f32 | |
%436 = "vector.splat"(%435) : (f32) -> vector<4xf32> | |
%437 = "vector.fma"(%436, %267, %434) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%438 = "vector.extract"(%206) {position = [2]} : (vector<4xf32>) -> f32 | |
%439 = "vector.splat"(%438) : (f32) -> vector<4xf32> | |
%440 = "vector.fma"(%439, %269, %437) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%441 = "vector.extract"(%206) {position = [3]} : (vector<4xf32>) -> f32 | |
%442 = "vector.splat"(%441) : (f32) -> vector<4xf32> | |
%443 = "vector.fma"(%442, %271, %440) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%444 = "vector.extract"(%208) {position = [0]} : (vector<4xf32>) -> f32 | |
%445 = "vector.splat"(%444) : (f32) -> vector<4xf32> | |
%446 = "vector.fma"(%445, %273, %443) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%447 = "vector.extract"(%208) {position = [1]} : (vector<4xf32>) -> f32 | |
%448 = "vector.splat"(%447) : (f32) -> vector<4xf32> | |
%449 = "vector.fma"(%448, %275, %446) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%450 = "vector.extract"(%208) {position = [2]} : (vector<4xf32>) -> f32 | |
%451 = "vector.splat"(%450) : (f32) -> vector<4xf32> | |
%452 = "vector.fma"(%451, %277, %449) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%453 = "vector.extract"(%208) {position = [3]} : (vector<4xf32>) -> f32 | |
%454 = "vector.splat"(%453) : (f32) -> vector<4xf32> | |
%455 = "vector.fma"(%454, %279, %452) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%456 = "vector.extract"(%210) {position = [0]} : (vector<4xf32>) -> f32 | |
%457 = "vector.splat"(%456) : (f32) -> vector<4xf32> | |
%458 = "vector.fma"(%457, %281, %455) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%459 = "vector.extract"(%210) {position = [1]} : (vector<4xf32>) -> f32 | |
%460 = "vector.splat"(%459) : (f32) -> vector<4xf32> | |
%461 = "vector.fma"(%460, %283, %458) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%462 = "vector.extract"(%210) {position = [2]} : (vector<4xf32>) -> f32 | |
%463 = "vector.splat"(%462) : (f32) -> vector<4xf32> | |
%464 = "vector.fma"(%463, %285, %461) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%465 = "vector.extract"(%210) {position = [3]} : (vector<4xf32>) -> f32 | |
%466 = "vector.splat"(%465) : (f32) -> vector<4xf32> | |
%467 = "vector.fma"(%466, %287, %464) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%468 = "vector.extract"(%212) {position = [0]} : (vector<4xf32>) -> f32 | |
%469 = "vector.splat"(%468) : (f32) -> vector<4xf32> | |
%470 = "vector.fma"(%469, %289, %467) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%471 = "vector.extract"(%212) {position = [1]} : (vector<4xf32>) -> f32 | |
%472 = "vector.splat"(%471) : (f32) -> vector<4xf32> | |
%473 = "vector.fma"(%472, %291, %470) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%474 = "vector.extract"(%212) {position = [2]} : (vector<4xf32>) -> f32 | |
%475 = "vector.splat"(%474) : (f32) -> vector<4xf32> | |
%476 = "vector.fma"(%475, %293, %473) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%477 = "vector.extract"(%212) {position = [3]} : (vector<4xf32>) -> f32 | |
%478 = "vector.splat"(%477) : (f32) -> vector<4xf32> | |
%479 = "vector.fma"(%478, %295, %476) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%480 = "vector.extract"(%214) {position = [0]} : (vector<4xf32>) -> f32 | |
%481 = "vector.splat"(%480) : (f32) -> vector<4xf32> | |
%482 = "vector.fma"(%481, %297, %479) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%483 = "vector.extract"(%214) {position = [1]} : (vector<4xf32>) -> f32 | |
%484 = "vector.splat"(%483) : (f32) -> vector<4xf32> | |
%485 = "vector.fma"(%484, %299, %482) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%486 = "vector.extract"(%214) {position = [2]} : (vector<4xf32>) -> f32 | |
%487 = "vector.splat"(%486) : (f32) -> vector<4xf32> | |
%488 = "vector.fma"(%487, %301, %485) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%489 = "vector.extract"(%214) {position = [3]} : (vector<4xf32>) -> f32 | |
%490 = "vector.splat"(%489) : (f32) -> vector<4xf32> | |
%491 = "vector.fma"(%490, %303, %488) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%492 = "vector.extract"(%216) {position = [0]} : (vector<4xf32>) -> f32 | |
%493 = "vector.splat"(%492) : (f32) -> vector<4xf32> | |
%494 = "vector.fma"(%493, %305, %491) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%495 = "vector.extract"(%216) {position = [1]} : (vector<4xf32>) -> f32 | |
%496 = "vector.splat"(%495) : (f32) -> vector<4xf32> | |
%497 = "vector.fma"(%496, %307, %494) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%498 = "vector.extract"(%216) {position = [2]} : (vector<4xf32>) -> f32 | |
%499 = "vector.splat"(%498) : (f32) -> vector<4xf32> | |
%500 = "vector.fma"(%499, %309, %497) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%501 = "vector.extract"(%216) {position = [3]} : (vector<4xf32>) -> f32 | |
%502 = "vector.splat"(%501) : (f32) -> vector<4xf32> | |
%503 = "vector.fma"(%502, %311, %500) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%504 = "vector.extract"(%218) {position = [0]} : (vector<4xf32>) -> f32 | |
%505 = "vector.splat"(%504) : (f32) -> vector<4xf32> | |
%506 = "vector.fma"(%505, %249, %172#2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%507 = "vector.extract"(%218) {position = [1]} : (vector<4xf32>) -> f32 | |
%508 = "vector.splat"(%507) : (f32) -> vector<4xf32> | |
%509 = "vector.fma"(%508, %251, %506) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%510 = "vector.extract"(%218) {position = [2]} : (vector<4xf32>) -> f32 | |
%511 = "vector.splat"(%510) : (f32) -> vector<4xf32> | |
%512 = "vector.fma"(%511, %253, %509) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%513 = "vector.extract"(%218) {position = [3]} : (vector<4xf32>) -> f32 | |
%514 = "vector.splat"(%513) : (f32) -> vector<4xf32> | |
%515 = "vector.fma"(%514, %255, %512) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%516 = "vector.extract"(%220) {position = [0]} : (vector<4xf32>) -> f32 | |
%517 = "vector.splat"(%516) : (f32) -> vector<4xf32> | |
%518 = "vector.fma"(%517, %257, %515) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%519 = "vector.extract"(%220) {position = [1]} : (vector<4xf32>) -> f32 | |
%520 = "vector.splat"(%519) : (f32) -> vector<4xf32> | |
%521 = "vector.fma"(%520, %259, %518) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%522 = "vector.extract"(%220) {position = [2]} : (vector<4xf32>) -> f32 | |
%523 = "vector.splat"(%522) : (f32) -> vector<4xf32> | |
%524 = "vector.fma"(%523, %261, %521) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%525 = "vector.extract"(%220) {position = [3]} : (vector<4xf32>) -> f32 | |
%526 = "vector.splat"(%525) : (f32) -> vector<4xf32> | |
%527 = "vector.fma"(%526, %263, %524) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%528 = "vector.extract"(%222) {position = [0]} : (vector<4xf32>) -> f32 | |
%529 = "vector.splat"(%528) : (f32) -> vector<4xf32> | |
%530 = "vector.fma"(%529, %265, %527) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%531 = "vector.extract"(%222) {position = [1]} : (vector<4xf32>) -> f32 | |
%532 = "vector.splat"(%531) : (f32) -> vector<4xf32> | |
%533 = "vector.fma"(%532, %267, %530) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%534 = "vector.extract"(%222) {position = [2]} : (vector<4xf32>) -> f32 | |
%535 = "vector.splat"(%534) : (f32) -> vector<4xf32> | |
%536 = "vector.fma"(%535, %269, %533) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%537 = "vector.extract"(%222) {position = [3]} : (vector<4xf32>) -> f32 | |
%538 = "vector.splat"(%537) : (f32) -> vector<4xf32> | |
%539 = "vector.fma"(%538, %271, %536) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%540 = "vector.extract"(%224) {position = [0]} : (vector<4xf32>) -> f32 | |
%541 = "vector.splat"(%540) : (f32) -> vector<4xf32> | |
%542 = "vector.fma"(%541, %273, %539) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%543 = "vector.extract"(%224) {position = [1]} : (vector<4xf32>) -> f32 | |
%544 = "vector.splat"(%543) : (f32) -> vector<4xf32> | |
%545 = "vector.fma"(%544, %275, %542) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%546 = "vector.extract"(%224) {position = [2]} : (vector<4xf32>) -> f32 | |
%547 = "vector.splat"(%546) : (f32) -> vector<4xf32> | |
%548 = "vector.fma"(%547, %277, %545) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%549 = "vector.extract"(%224) {position = [3]} : (vector<4xf32>) -> f32 | |
%550 = "vector.splat"(%549) : (f32) -> vector<4xf32> | |
%551 = "vector.fma"(%550, %279, %548) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%552 = "vector.extract"(%226) {position = [0]} : (vector<4xf32>) -> f32 | |
%553 = "vector.splat"(%552) : (f32) -> vector<4xf32> | |
%554 = "vector.fma"(%553, %281, %551) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%555 = "vector.extract"(%226) {position = [1]} : (vector<4xf32>) -> f32 | |
%556 = "vector.splat"(%555) : (f32) -> vector<4xf32> | |
%557 = "vector.fma"(%556, %283, %554) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%558 = "vector.extract"(%226) {position = [2]} : (vector<4xf32>) -> f32 | |
%559 = "vector.splat"(%558) : (f32) -> vector<4xf32> | |
%560 = "vector.fma"(%559, %285, %557) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%561 = "vector.extract"(%226) {position = [3]} : (vector<4xf32>) -> f32 | |
%562 = "vector.splat"(%561) : (f32) -> vector<4xf32> | |
%563 = "vector.fma"(%562, %287, %560) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%564 = "vector.extract"(%228) {position = [0]} : (vector<4xf32>) -> f32 | |
%565 = "vector.splat"(%564) : (f32) -> vector<4xf32> | |
%566 = "vector.fma"(%565, %289, %563) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%567 = "vector.extract"(%228) {position = [1]} : (vector<4xf32>) -> f32 | |
%568 = "vector.splat"(%567) : (f32) -> vector<4xf32> | |
%569 = "vector.fma"(%568, %291, %566) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%570 = "vector.extract"(%228) {position = [2]} : (vector<4xf32>) -> f32 | |
%571 = "vector.splat"(%570) : (f32) -> vector<4xf32> | |
%572 = "vector.fma"(%571, %293, %569) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%573 = "vector.extract"(%228) {position = [3]} : (vector<4xf32>) -> f32 | |
%574 = "vector.splat"(%573) : (f32) -> vector<4xf32> | |
%575 = "vector.fma"(%574, %295, %572) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%576 = "vector.extract"(%230) {position = [0]} : (vector<4xf32>) -> f32 | |
%577 = "vector.splat"(%576) : (f32) -> vector<4xf32> | |
%578 = "vector.fma"(%577, %297, %575) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%579 = "vector.extract"(%230) {position = [1]} : (vector<4xf32>) -> f32 | |
%580 = "vector.splat"(%579) : (f32) -> vector<4xf32> | |
%581 = "vector.fma"(%580, %299, %578) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%582 = "vector.extract"(%230) {position = [2]} : (vector<4xf32>) -> f32 | |
%583 = "vector.splat"(%582) : (f32) -> vector<4xf32> | |
%584 = "vector.fma"(%583, %301, %581) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%585 = "vector.extract"(%230) {position = [3]} : (vector<4xf32>) -> f32 | |
%586 = "vector.splat"(%585) : (f32) -> vector<4xf32> | |
%587 = "vector.fma"(%586, %303, %584) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%588 = "vector.extract"(%232) {position = [0]} : (vector<4xf32>) -> f32 | |
%589 = "vector.splat"(%588) : (f32) -> vector<4xf32> | |
%590 = "vector.fma"(%589, %305, %587) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%591 = "vector.extract"(%232) {position = [1]} : (vector<4xf32>) -> f32 | |
%592 = "vector.splat"(%591) : (f32) -> vector<4xf32> | |
%593 = "vector.fma"(%592, %307, %590) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%594 = "vector.extract"(%232) {position = [2]} : (vector<4xf32>) -> f32 | |
%595 = "vector.splat"(%594) : (f32) -> vector<4xf32> | |
%596 = "vector.fma"(%595, %309, %593) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%597 = "vector.extract"(%232) {position = [3]} : (vector<4xf32>) -> f32 | |
%598 = "vector.splat"(%597) : (f32) -> vector<4xf32> | |
%599 = "vector.fma"(%598, %311, %596) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%600 = "vector.extract"(%234) {position = [0]} : (vector<4xf32>) -> f32 | |
%601 = "vector.splat"(%600) : (f32) -> vector<4xf32> | |
%602 = "vector.fma"(%601, %249, %172#3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%603 = "vector.extract"(%234) {position = [1]} : (vector<4xf32>) -> f32 | |
%604 = "vector.splat"(%603) : (f32) -> vector<4xf32> | |
%605 = "vector.fma"(%604, %251, %602) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%606 = "vector.extract"(%234) {position = [2]} : (vector<4xf32>) -> f32 | |
%607 = "vector.splat"(%606) : (f32) -> vector<4xf32> | |
%608 = "vector.fma"(%607, %253, %605) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%609 = "vector.extract"(%234) {position = [3]} : (vector<4xf32>) -> f32 | |
%610 = "vector.splat"(%609) : (f32) -> vector<4xf32> | |
%611 = "vector.fma"(%610, %255, %608) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%612 = "vector.extract"(%236) {position = [0]} : (vector<4xf32>) -> f32 | |
%613 = "vector.splat"(%612) : (f32) -> vector<4xf32> | |
%614 = "vector.fma"(%613, %257, %611) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%615 = "vector.extract"(%236) {position = [1]} : (vector<4xf32>) -> f32 | |
%616 = "vector.splat"(%615) : (f32) -> vector<4xf32> | |
%617 = "vector.fma"(%616, %259, %614) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%618 = "vector.extract"(%236) {position = [2]} : (vector<4xf32>) -> f32 | |
%619 = "vector.splat"(%618) : (f32) -> vector<4xf32> | |
%620 = "vector.fma"(%619, %261, %617) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%621 = "vector.extract"(%236) {position = [3]} : (vector<4xf32>) -> f32 | |
%622 = "vector.splat"(%621) : (f32) -> vector<4xf32> | |
%623 = "vector.fma"(%622, %263, %620) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%624 = "vector.extract"(%238) {position = [0]} : (vector<4xf32>) -> f32 | |
%625 = "vector.splat"(%624) : (f32) -> vector<4xf32> | |
%626 = "vector.fma"(%625, %265, %623) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%627 = "vector.extract"(%238) {position = [1]} : (vector<4xf32>) -> f32 | |
%628 = "vector.splat"(%627) : (f32) -> vector<4xf32> | |
%629 = "vector.fma"(%628, %267, %626) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%630 = "vector.extract"(%238) {position = [2]} : (vector<4xf32>) -> f32 | |
%631 = "vector.splat"(%630) : (f32) -> vector<4xf32> | |
%632 = "vector.fma"(%631, %269, %629) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%633 = "vector.extract"(%238) {position = [3]} : (vector<4xf32>) -> f32 | |
%634 = "vector.splat"(%633) : (f32) -> vector<4xf32> | |
%635 = "vector.fma"(%634, %271, %632) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%636 = "vector.extract"(%240) {position = [0]} : (vector<4xf32>) -> f32 | |
%637 = "vector.splat"(%636) : (f32) -> vector<4xf32> | |
%638 = "vector.fma"(%637, %273, %635) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%639 = "vector.extract"(%240) {position = [1]} : (vector<4xf32>) -> f32 | |
%640 = "vector.splat"(%639) : (f32) -> vector<4xf32> | |
%641 = "vector.fma"(%640, %275, %638) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%642 = "vector.extract"(%240) {position = [2]} : (vector<4xf32>) -> f32 | |
%643 = "vector.splat"(%642) : (f32) -> vector<4xf32> | |
%644 = "vector.fma"(%643, %277, %641) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%645 = "vector.extract"(%240) {position = [3]} : (vector<4xf32>) -> f32 | |
%646 = "vector.splat"(%645) : (f32) -> vector<4xf32> | |
%647 = "vector.fma"(%646, %279, %644) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%648 = "vector.extract"(%242) {position = [0]} : (vector<4xf32>) -> f32 | |
%649 = "vector.splat"(%648) : (f32) -> vector<4xf32> | |
%650 = "vector.fma"(%649, %281, %647) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%651 = "vector.extract"(%242) {position = [1]} : (vector<4xf32>) -> f32 | |
%652 = "vector.splat"(%651) : (f32) -> vector<4xf32> | |
%653 = "vector.fma"(%652, %283, %650) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%654 = "vector.extract"(%242) {position = [2]} : (vector<4xf32>) -> f32 | |
%655 = "vector.splat"(%654) : (f32) -> vector<4xf32> | |
%656 = "vector.fma"(%655, %285, %653) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%657 = "vector.extract"(%242) {position = [3]} : (vector<4xf32>) -> f32 | |
%658 = "vector.splat"(%657) : (f32) -> vector<4xf32> | |
%659 = "vector.fma"(%658, %287, %656) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%660 = "vector.extract"(%244) {position = [0]} : (vector<4xf32>) -> f32 | |
%661 = "vector.splat"(%660) : (f32) -> vector<4xf32> | |
%662 = "vector.fma"(%661, %289, %659) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%663 = "vector.extract"(%244) {position = [1]} : (vector<4xf32>) -> f32 | |
%664 = "vector.splat"(%663) : (f32) -> vector<4xf32> | |
%665 = "vector.fma"(%664, %291, %662) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%666 = "vector.extract"(%244) {position = [2]} : (vector<4xf32>) -> f32 | |
%667 = "vector.splat"(%666) : (f32) -> vector<4xf32> | |
%668 = "vector.fma"(%667, %293, %665) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%669 = "vector.extract"(%244) {position = [3]} : (vector<4xf32>) -> f32 | |
%670 = "vector.splat"(%669) : (f32) -> vector<4xf32> | |
%671 = "vector.fma"(%670, %295, %668) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%672 = "vector.extract"(%246) {position = [0]} : (vector<4xf32>) -> f32 | |
%673 = "vector.splat"(%672) : (f32) -> vector<4xf32> | |
%674 = "vector.fma"(%673, %297, %671) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%675 = "vector.extract"(%246) {position = [1]} : (vector<4xf32>) -> f32 | |
%676 = "vector.splat"(%675) : (f32) -> vector<4xf32> | |
%677 = "vector.fma"(%676, %299, %674) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%678 = "vector.extract"(%246) {position = [2]} : (vector<4xf32>) -> f32 | |
%679 = "vector.splat"(%678) : (f32) -> vector<4xf32> | |
%680 = "vector.fma"(%679, %301, %677) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%681 = "vector.extract"(%246) {position = [3]} : (vector<4xf32>) -> f32 | |
%682 = "vector.splat"(%681) : (f32) -> vector<4xf32> | |
%683 = "vector.fma"(%682, %303, %680) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%684 = "vector.extract"(%248) {position = [0]} : (vector<4xf32>) -> f32 | |
%685 = "vector.splat"(%684) : (f32) -> vector<4xf32> | |
%686 = "vector.fma"(%685, %305, %683) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%687 = "vector.extract"(%248) {position = [1]} : (vector<4xf32>) -> f32 | |
%688 = "vector.splat"(%687) : (f32) -> vector<4xf32> | |
%689 = "vector.fma"(%688, %307, %686) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%690 = "vector.extract"(%248) {position = [2]} : (vector<4xf32>) -> f32 | |
%691 = "vector.splat"(%690) : (f32) -> vector<4xf32> | |
%692 = "vector.fma"(%691, %309, %689) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%693 = "vector.extract"(%248) {position = [3]} : (vector<4xf32>) -> f32 | |
%694 = "vector.splat"(%693) : (f32) -> vector<4xf32> | |
%695 = "vector.fma"(%694, %311, %692) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"memref.store"(%695, %109, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%599, %110, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%503, %111, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%407, %112, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 16, 1]>, sym_name = "forward_dispatch_35_matmul_18432x320x320"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
^ | |
/home/prashant/stable.mlir:1320:12: error: failed to serialize executables | |
%133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
^ | |
/home/prashant/stable.mlir:24:3: note: called from | |
func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
^ | |
/home/prashant/stable.mlir:1320:12: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 288 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_35_matmul_18432x320x320", translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>, workgroup_size = [16 : index, 16 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -33 : index} : () -> index | |
%1 = "arith.constant"() {value = 527 : index} : () -> index | |
%2 = "arith.constant"() {value = 510 : index} : () -> index | |
%3 = "arith.constant"() {value = 493 : index} : () -> index | |
%4 = "arith.constant"() {value = 476 : index} : () -> index | |
%5 = "arith.constant"() {value = 459 : index} : () -> index | |
%6 = "arith.constant"() {value = 442 : index} : () -> index | |
%7 = "arith.constant"() {value = 425 : index} : () -> index | |
%8 = "arith.constant"() {value = 408 : index} : () -> index | |
%9 = "arith.constant"() {value = 391 : index} : () -> index | |
%10 = "arith.constant"() {value = 374 : index} : () -> index | |
%11 = "arith.constant"() {value = 357 : index} : () -> index | |
%12 = "arith.constant"() {value = 340 : index} : () -> index | |
%13 = "arith.constant"() {value = 323 : index} : () -> index | |
%14 = "arith.constant"() {value = 306 : index} : () -> index | |
%15 = "arith.constant"() {value = 289 : index} : () -> index | |
%16 = "arith.constant"() {value = 255 : index} : () -> index | |
%17 = "arith.constant"() {value = 238 : index} : () -> index | |
%18 = "arith.constant"() {value = 221 : index} : () -> index | |
%19 = "arith.constant"() {value = 204 : index} : () -> index | |
%20 = "arith.constant"() {value = 187 : index} : () -> index | |
%21 = "arith.constant"() {value = 170 : index} : () -> index | |
%22 = "arith.constant"() {value = 153 : index} : () -> index | |
%23 = "arith.constant"() {value = 136 : index} : () -> index | |
%24 = "arith.constant"() {value = 119 : index} : () -> index | |
%25 = "arith.constant"() {value = 102 : index} : () -> index | |
%26 = "arith.constant"() {value = 85 : index} : () -> index | |
%27 = "arith.constant"() {value = 68 : index} : () -> index | |
%28 = "arith.constant"() {value = 51 : index} : () -> index | |
%29 = "arith.constant"() {value = 34 : index} : () -> index | |
%30 = "arith.constant"() {value = 33 : index} : () -> index | |
%31 = "arith.constant"() {value = 31 : index} : () -> index | |
%32 = "arith.constant"() {value = 30 : index} : () -> index | |
%33 = "arith.constant"() {value = 29 : index} : () -> index | |
%34 = "arith.constant"() {value = 28 : index} : () -> index | |
%35 = "arith.constant"() {value = 27 : index} : () -> index | |
%36 = "arith.constant"() {value = 25 : index} : () -> index | |
%37 = "arith.constant"() {value = 24 : index} : () -> index | |
%38 = "arith.constant"() {value = 23 : index} : () -> index | |
%39 = "arith.constant"() {value = 22 : index} : () -> index | |
%40 = "arith.constant"() {value = 21 : index} : () -> index | |
%41 = "arith.constant"() {value = 20 : index} : () -> index | |
%42 = "arith.constant"() {value = 19 : index} : () -> index | |
%43 = "arith.constant"() {value = 15 : index} : () -> index | |
%44 = "arith.constant"() {value = 14 : index} : () -> index | |
%45 = "arith.constant"() {value = 13 : index} : () -> index | |
%46 = "arith.constant"() {value = 12 : index} : () -> index | |
%47 = "arith.constant"() {value = 11 : index} : () -> index | |
%48 = "arith.constant"() {value = 10 : index} : () -> index | |
%49 = "arith.constant"() {value = 9 : index} : () -> index | |
%50 = "arith.constant"() {value = 7 : index} : () -> index | |
%51 = "arith.constant"() {value = 6 : index} : () -> index | |
%52 = "arith.constant"() {value = 5 : index} : () -> index | |
%53 = "arith.constant"() {value = 4 : index} : () -> index | |
%54 = "arith.constant"() {value = 3 : index} : () -> index | |
%55 = "arith.constant"() {value = 2 : index} : () -> index | |
%56 = "arith.constant"() {value = 1 : index} : () -> index | |
%57 = "arith.constant"() {value = 36 : index} : () -> index | |
%58 = "arith.constant"() {value = 272 : index} : () -> index | |
%59 = "arith.constant"() {value = 17 : index} : () -> index | |
%60 = "arith.constant"() {value = 18 : index} : () -> index | |
%61 = "arith.constant"() {value = 64 : index} : () -> index | |
%62 = "arith.constant"() {value = 1280 : index} : () -> index | |
%63 = "arith.constant"() {value = 1477120 : index} : () -> index | |
%64 = "arith.constant"() {value = 72 : index} : () -> index | |
%65 = "arith.constant"() {value = 8 : index} : () -> index | |
%66 = "arith.constant"() {value = 2560 : index} : () -> index | |
%67 = "arith.constant"() {value = 240 : index} : () -> index | |
%68 = "arith.constant"() {value = 160 : index} : () -> index | |
%69 = "arith.constant"() {value = 80 : index} : () -> index | |
%70 = "arith.constant"() {value = -1 : index} : () -> index | |
%71 = "arith.constant"() {value = 16 : index} : () -> index | |
%72 = "arith.constant"() {value = 320 : index} : () -> index | |
%73 = "arith.constant"() {value = 5120 : index} : () -> index | |
%74 = "arith.constant"() {value = 0 : index} : () -> index | |
%75 = "arith.constant"() {value = 1474560 : index} : () -> index | |
%76 = "arith.constant"() {value = 25600 : index} : () -> index | |
%77 = "arith.constant"() {value = 288 : index} : () -> index | |
%78 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
%79 = "arith.constant"() {value = 23592960 : index} : () -> index | |
%80 = "arith.constant"() {value = 32 : index} : () -> index | |
%81 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%82 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%83 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
%84 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
%85 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
%86 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%87 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%88 = "arith.index_castui"(%86) : (i32) -> index | |
%89 = "arith.index_castui"(%87) : (i32) -> index | |
%90 = "hal.interface.binding.subspan"(%79, %75) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%91 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%92 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%93 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%94 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%95 = "hal.interface.binding.subspan"(%88, %76) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%96 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%97 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%98 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%99 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%100 = "hal.interface.binding.subspan"(%89, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%101 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%102 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%103 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%104 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%105 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%106 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%107 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%108 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%109 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%110 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%111 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%112 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
%113 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%114 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%115 = "arith.muli"(%114, %73) : (index, index) -> index | |
%116 = "arith.muli"(%82, %72) : (index, index) -> index | |
%117 = "arith.addi"(%115, %116) : (index, index) -> index | |
%118 = "arith.muli"(%113, %71) : (index, index) -> index | |
%119 = "arith.addi"(%117, %118) : (index, index) -> index | |
%120 = "arith.addi"(%119, %81) : (index, index) -> index | |
%121 = "arith.cmpi"(%89, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%122 = "arith.subi"(%70, %89) : (index, index) -> index | |
%123 = "arith.select"(%121, %122, %89) : (i1, index, index) -> index | |
%124 = "arith.divsi"(%123, %71) : (index, index) -> index | |
%125 = "arith.subi"(%70, %124) : (index, index) -> index | |
%126 = "arith.select"(%121, %125, %124) : (i1, index, index) -> index | |
%127 = "arith.addi"(%120, %126) : (index, index) -> index | |
"memref.store"(%78, %101, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%128 = "arith.addi"(%127, %69) : (index, index) -> index | |
"memref.store"(%78, %102, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%129 = "arith.addi"(%127, %68) : (index, index) -> index | |
"memref.store"(%78, %103, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%130 = "arith.addi"(%127, %67) : (index, index) -> index | |
"memref.store"(%78, %104, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
%131 = "memref.load"(%105, %127) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%132 = "memref.load"(%106, %128) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%133 = "memref.load"(%107, %129) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%134 = "memref.load"(%108, %130) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%135 = "arith.addi"(%115, %81) : (index, index) -> index | |
%136 = "arith.muli"(%82, %68) : (index, index) -> index | |
%137 = "arith.addi"(%135, %136) : (index, index) -> index | |
%138 = "arith.muli"(%83, %66) : (index, index) -> index | |
%139 = "arith.addi"(%137, %138) : (index, index) -> index | |
%140 = "arith.cmpi"(%81, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%141 = "arith.subi"(%70, %81) : (index, index) -> index | |
%142 = "arith.select"(%140, %141, %81) : (i1, index, index) -> index | |
%143 = "arith.divsi"(%142, %65) : (index, index) -> index | |
%144 = "arith.subi"(%70, %143) : (index, index) -> index | |
%145 = "arith.select"(%140, %144, %143) : (i1, index, index) -> index | |
%146 = "arith.muli"(%145, %64) : (index, index) -> index | |
%147 = "arith.addi"(%139, %146) : (index, index) -> index | |
%148 = "arith.addi"(%147, %75) : (index, index) -> index | |
%149 = "memref.load"(%91, %148) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%150 = "arith.addi"(%147, %63) : (index, index) -> index | |
%151 = "memref.load"(%92, %150) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%152 = "arith.muli"(%82, %69) : (index, index) -> index | |
%153 = "arith.addi"(%81, %152) : (index, index) -> index | |
%154 = "arith.muli"(%83, %62) : (index, index) -> index | |
%155 = "arith.addi"(%153, %154) : (index, index) -> index | |
%156 = "arith.addi"(%155, %118) : (index, index) -> index | |
%157 = "arith.divsi"(%142, %71) : (index, index) -> index | |
%158 = "arith.subi"(%70, %157) : (index, index) -> index | |
%159 = "arith.select"(%140, %158, %157) : (i1, index, index) -> index | |
%160 = "arith.muli"(%159, %61) : (index, index) -> index | |
%161 = "arith.addi"(%156, %160) : (index, index) -> index | |
%162 = "arith.cmpi"(%88, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%163 = "arith.subi"(%70, %88) : (index, index) -> index | |
%164 = "arith.select"(%162, %163, %88) : (i1, index, index) -> index | |
%165 = "arith.divsi"(%164, %71) : (index, index) -> index | |
%166 = "arith.subi"(%70, %165) : (index, index) -> index | |
%167 = "arith.select"(%162, %166, %165) : (i1, index, index) -> index | |
%168 = "arith.addi"(%161, %167) : (index, index) -> index | |
%169 = "memref.load"(%96, %168) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%170 = "arith.addi"(%168, %62) : (index, index) -> index | |
%171 = "memref.load"(%97, %170) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%172:8 = "scf.for"(%74, %77, %80, %131, %132, %133, %134, %149, %151, %169, %171) ({ | |
^bb0(%arg0: index, %arg1: vector<4xf32>, %arg2: vector<4xf32>, %arg3: vector<4xf32>, %arg4: vector<4xf32>, %arg5: vector<4xf32>, %arg6: vector<4xf32>, %arg7: vector<4xf32>, %arg8: vector<4xf32>): | |
"gpu.barrier"() : () -> () | |
%696 = "arith.muli"(%82, %60) : (index, index) -> index | |
%697 = "arith.addi"(%81, %696) : (index, index) -> index | |
%698 = "arith.muli"(%83, %77) : (index, index) -> index | |
%699 = "arith.addi"(%697, %698) : (index, index) -> index | |
%700 = "arith.addi"(%699, %145) : (index, index) -> index | |
"memref.store"(%arg5, %84, %700) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%701 = "arith.addi"(%700, %77) : (index, index) -> index | |
"memref.store"(%arg6, %84, %701) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%702 = "arith.muli"(%82, %59) : (index, index) -> index | |
%703 = "arith.addi"(%81, %702) : (index, index) -> index | |
%704 = "arith.muli"(%83, %58) : (index, index) -> index | |
%705 = "arith.addi"(%703, %704) : (index, index) -> index | |
%706 = "arith.addi"(%705, %159) : (index, index) -> index | |
"memref.store"(%arg7, %85, %706) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%707 = "arith.addi"(%706, %58) : (index, index) -> index | |
"memref.store"(%arg8, %85, %707) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
"gpu.barrier"() : () -> () | |
%708 = "arith.muli"(%82, %57) : (index, index) -> index | |
%709 = "memref.load"(%84, %708) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%710 = "arith.addi"(%708, %56) : (index, index) -> index | |
%711 = "memref.load"(%84, %710) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%712 = "arith.addi"(%708, %55) : (index, index) -> index | |
%713 = "memref.load"(%84, %712) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%714 = "arith.addi"(%708, %54) : (index, index) -> index | |
%715 = "memref.load"(%84, %714) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%716 = "arith.addi"(%708, %53) : (index, index) -> index | |
%717 = "memref.load"(%84, %716) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%718 = "arith.addi"(%708, %52) : (index, index) -> index | |
%719 = "memref.load"(%84, %718) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%720 = "arith.addi"(%708, %51) : (index, index) -> index | |
%721 = "memref.load"(%84, %720) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%722 = "arith.addi"(%708, %50) : (index, index) -> index | |
%723 = "memref.load"(%84, %722) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%724 = "arith.addi"(%708, %49) : (index, index) -> index | |
%725 = "memref.load"(%84, %724) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%726 = "arith.addi"(%708, %48) : (index, index) -> index | |
%727 = "memref.load"(%84, %726) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%728 = "arith.addi"(%708, %47) : (index, index) -> index | |
%729 = "memref.load"(%84, %728) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%730 = "arith.addi"(%708, %46) : (index, index) -> index | |
%731 = "memref.load"(%84, %730) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%732 = "arith.addi"(%708, %45) : (index, index) -> index | |
%733 = "memref.load"(%84, %732) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%734 = "arith.addi"(%708, %44) : (index, index) -> index | |
%735 = "memref.load"(%84, %734) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%736 = "arith.addi"(%708, %43) : (index, index) -> index | |
%737 = "memref.load"(%84, %736) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%738 = "arith.addi"(%708, %71) : (index, index) -> index | |
%739 = "memref.load"(%84, %738) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%740 = "arith.addi"(%708, %60) : (index, index) -> index | |
%741 = "memref.load"(%84, %740) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%742 = "arith.addi"(%708, %42) : (index, index) -> index | |
%743 = "memref.load"(%84, %742) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%744 = "arith.addi"(%708, %41) : (index, index) -> index | |
%745 = "memref.load"(%84, %744) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%746 = "arith.addi"(%708, %40) : (index, index) -> index | |
%747 = "memref.load"(%84, %746) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%748 = "arith.addi"(%708, %39) : (index, index) -> index | |
%749 = "memref.load"(%84, %748) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%750 = "arith.addi"(%708, %38) : (index, index) -> index | |
%751 = "memref.load"(%84, %750) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%752 = "arith.addi"(%708, %37) : (index, index) -> index | |
%753 = "memref.load"(%84, %752) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%754 = "arith.addi"(%708, %36) : (index, index) -> index | |
%755 = "memref.load"(%84, %754) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%756 = "arith.addi"(%708, %35) : (index, index) -> index | |
%757 = "memref.load"(%84, %756) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%758 = "arith.addi"(%708, %34) : (index, index) -> index | |
%759 = "memref.load"(%84, %758) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%760 = "arith.addi"(%708, %33) : (index, index) -> index | |
%761 = "memref.load"(%84, %760) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%762 = "arith.addi"(%708, %32) : (index, index) -> index | |
%763 = "memref.load"(%84, %762) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%764 = "arith.addi"(%708, %31) : (index, index) -> index | |
%765 = "memref.load"(%84, %764) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%766 = "arith.addi"(%708, %80) : (index, index) -> index | |
%767 = "memref.load"(%84, %766) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%768 = "arith.addi"(%708, %30) : (index, index) -> index | |
%769 = "memref.load"(%84, %768) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%770 = "arith.addi"(%708, %29) : (index, index) -> index | |
%771 = "memref.load"(%84, %770) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%772 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%773 = "arith.addi"(%81, %59) : (index, index) -> index | |
%774 = "memref.load"(%85, %773) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%775 = "arith.addi"(%81, %29) : (index, index) -> index | |
%776 = "memref.load"(%85, %775) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%777 = "arith.addi"(%81, %28) : (index, index) -> index | |
%778 = "memref.load"(%85, %777) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%779 = "arith.addi"(%81, %27) : (index, index) -> index | |
%780 = "memref.load"(%85, %779) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%781 = "arith.addi"(%81, %26) : (index, index) -> index | |
%782 = "memref.load"(%85, %781) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%783 = "arith.addi"(%81, %25) : (index, index) -> index | |
%784 = "memref.load"(%85, %783) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%785 = "arith.addi"(%81, %24) : (index, index) -> index | |
%786 = "memref.load"(%85, %785) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%787 = "arith.addi"(%81, %23) : (index, index) -> index | |
%788 = "memref.load"(%85, %787) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%789 = "arith.addi"(%81, %22) : (index, index) -> index | |
%790 = "memref.load"(%85, %789) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%791 = "arith.addi"(%81, %21) : (index, index) -> index | |
%792 = "memref.load"(%85, %791) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%793 = "arith.addi"(%81, %20) : (index, index) -> index | |
%794 = "memref.load"(%85, %793) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%795 = "arith.addi"(%81, %19) : (index, index) -> index | |
%796 = "memref.load"(%85, %795) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%797 = "arith.addi"(%81, %18) : (index, index) -> index | |
%798 = "memref.load"(%85, %797) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%799 = "arith.addi"(%81, %17) : (index, index) -> index | |
%800 = "memref.load"(%85, %799) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%801 = "arith.addi"(%81, %16) : (index, index) -> index | |
%802 = "memref.load"(%85, %801) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%803 = "arith.addi"(%81, %58) : (index, index) -> index | |
%804 = "memref.load"(%85, %803) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%805 = "arith.addi"(%81, %15) : (index, index) -> index | |
%806 = "memref.load"(%85, %805) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%807 = "arith.addi"(%81, %14) : (index, index) -> index | |
%808 = "memref.load"(%85, %807) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%809 = "arith.addi"(%81, %13) : (index, index) -> index | |
%810 = "memref.load"(%85, %809) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%811 = "arith.addi"(%81, %12) : (index, index) -> index | |
%812 = "memref.load"(%85, %811) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%813 = "arith.addi"(%81, %11) : (index, index) -> index | |
%814 = "memref.load"(%85, %813) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%815 = "arith.addi"(%81, %10) : (index, index) -> index | |
%816 = "memref.load"(%85, %815) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%817 = "arith.addi"(%81, %9) : (index, index) -> index | |
%818 = "memref.load"(%85, %817) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%819 = "arith.addi"(%81, %8) : (index, index) -> index | |
%820 = "memref.load"(%85, %819) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%821 = "arith.addi"(%81, %7) : (index, index) -> index | |
%822 = "memref.load"(%85, %821) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%823 = "arith.addi"(%81, %6) : (index, index) -> index | |
%824 = "memref.load"(%85, %823) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%825 = "arith.addi"(%81, %5) : (index, index) -> index | |
%826 = "memref.load"(%85, %825) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%827 = "arith.addi"(%81, %4) : (index, index) -> index | |
%828 = "memref.load"(%85, %827) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%829 = "arith.addi"(%81, %3) : (index, index) -> index | |
%830 = "memref.load"(%85, %829) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%831 = "arith.addi"(%81, %2) : (index, index) -> index | |
%832 = "memref.load"(%85, %831) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%833 = "arith.addi"(%81, %1) : (index, index) -> index | |
%834 = "memref.load"(%85, %833) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%835 = "vector.extract"(%709) {position = [0]} : (vector<4xf32>) -> f32 | |
%836 = "vector.splat"(%835) : (f32) -> vector<4xf32> | |
%837 = "vector.fma"(%836, %772, %arg1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%838 = "vector.extract"(%709) {position = [1]} : (vector<4xf32>) -> f32 | |
%839 = "vector.splat"(%838) : (f32) -> vector<4xf32> | |
%840 = "vector.fma"(%839, %774, %837) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%841 = "vector.extract"(%709) {position = [2]} : (vector<4xf32>) -> f32 | |
%842 = "vector.splat"(%841) : (f32) -> vector<4xf32> | |
%843 = "vector.fma"(%842, %776, %840) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%844 = "vector.extract"(%709) {position = [3]} : (vector<4xf32>) -> f32 | |
%845 = "vector.splat"(%844) : (f32) -> vector<4xf32> | |
%846 = "vector.fma"(%845, %778, %843) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%847 = "vector.extract"(%711) {position = [0]} : (vector<4xf32>) -> f32 | |
%848 = "vector.splat"(%847) : (f32) -> vector<4xf32> | |
%849 = "vector.fma"(%848, %780, %846) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%850 = "vector.extract"(%711) {position = [1]} : (vector<4xf32>) -> f32 | |
%851 = "vector.splat"(%850) : (f32) -> vector<4xf32> | |
%852 = "vector.fma"(%851, %782, %849) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%853 = "vector.extract"(%711) {position = [2]} : (vector<4xf32>) -> f32 | |
%854 = "vector.splat"(%853) : (f32) -> vector<4xf32> | |
%855 = "vector.fma"(%854, %784, %852) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%856 = "vector.extract"(%711) {position = [3]} : (vector<4xf32>) -> f32 | |
%857 = "vector.splat"(%856) : (f32) -> vector<4xf32> | |
%858 = "vector.fma"(%857, %786, %855) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%859 = "vector.extract"(%713) {position = [0]} : (vector<4xf32>) -> f32 | |
%860 = "vector.splat"(%859) : (f32) -> vector<4xf32> | |
%861 = "vector.fma"(%860, %788, %858) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%862 = "vector.extract"(%713) {position = [1]} : (vector<4xf32>) -> f32 | |
%863 = "vector.splat"(%862) : (f32) -> vector<4xf32> | |
%864 = "vector.fma"(%863, %790, %861) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%865 = "vector.extract"(%713) {position = [2]} : (vector<4xf32>) -> f32 | |
%866 = "vector.splat"(%865) : (f32) -> vector<4xf32> | |
%867 = "vector.fma"(%866, %792, %864) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%868 = "vector.extract"(%713) {position = [3]} : (vector<4xf32>) -> f32 | |
%869 = "vector.splat"(%868) : (f32) -> vector<4xf32> | |
%870 = "vector.fma"(%869, %794, %867) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%871 = "vector.extract"(%715) {position = [0]} : (vector<4xf32>) -> f32 | |
%872 = "vector.splat"(%871) : (f32) -> vector<4xf32> | |
%873 = "vector.fma"(%872, %796, %870) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%874 = "vector.extract"(%715) {position = [1]} : (vector<4xf32>) -> f32 | |
%875 = "vector.splat"(%874) : (f32) -> vector<4xf32> | |
%876 = "vector.fma"(%875, %798, %873) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%877 = "vector.extract"(%715) {position = [2]} : (vector<4xf32>) -> f32 | |
%878 = "vector.splat"(%877) : (f32) -> vector<4xf32> | |
%879 = "vector.fma"(%878, %800, %876) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%880 = "vector.extract"(%715) {position = [3]} : (vector<4xf32>) -> f32 | |
%881 = "vector.splat"(%880) : (f32) -> vector<4xf32> | |
%882 = "vector.fma"(%881, %802, %879) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%883 = "vector.extract"(%717) {position = [0]} : (vector<4xf32>) -> f32 | |
%884 = "vector.splat"(%883) : (f32) -> vector<4xf32> | |
%885 = "vector.fma"(%884, %804, %882) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%886 = "vector.extract"(%717) {position = [1]} : (vector<4xf32>) -> f32 | |
%887 = "vector.splat"(%886) : (f32) -> vector<4xf32> | |
%888 = "vector.fma"(%887, %806, %885) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%889 = "vector.extract"(%717) {position = [2]} : (vector<4xf32>) -> f32 | |
%890 = "vector.splat"(%889) : (f32) -> vector<4xf32> | |
%891 = "vector.fma"(%890, %808, %888) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%892 = "vector.extract"(%717) {position = [3]} : (vector<4xf32>) -> f32 | |
%893 = "vector.splat"(%892) : (f32) -> vector<4xf32> | |
%894 = "vector.fma"(%893, %810, %891) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%895 = "vector.extract"(%719) {position = [0]} : (vector<4xf32>) -> f32 | |
%896 = "vector.splat"(%895) : (f32) -> vector<4xf32> | |
%897 = "vector.fma"(%896, %812, %894) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%898 = "vector.extract"(%719) {position = [1]} : (vector<4xf32>) -> f32 | |
%899 = "vector.splat"(%898) : (f32) -> vector<4xf32> | |
%900 = "vector.fma"(%899, %814, %897) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%901 = "vector.extract"(%719) {position = [2]} : (vector<4xf32>) -> f32 | |
%902 = "vector.splat"(%901) : (f32) -> vector<4xf32> | |
%903 = "vector.fma"(%902, %816, %900) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%904 = "vector.extract"(%719) {position = [3]} : (vector<4xf32>) -> f32 | |
%905 = "vector.splat"(%904) : (f32) -> vector<4xf32> | |
%906 = "vector.fma"(%905, %818, %903) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%907 = "vector.extract"(%721) {position = [0]} : (vector<4xf32>) -> f32 | |
%908 = "vector.splat"(%907) : (f32) -> vector<4xf32> | |
%909 = "vector.fma"(%908, %820, %906) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%910 = "vector.extract"(%721) {position = [1]} : (vector<4xf32>) -> f32 | |
%911 = "vector.splat"(%910) : (f32) -> vector<4xf32> | |
%912 = "vector.fma"(%911, %822, %909) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%913 = "vector.extract"(%721) {position = [2]} : (vector<4xf32>) -> f32 | |
%914 = "vector.splat"(%913) : (f32) -> vector<4xf32> | |
%915 = "vector.fma"(%914, %824, %912) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%916 = "vector.extract"(%721) {position = [3]} : (vector<4xf32>) -> f32 | |
%917 = "vector.splat"(%916) : (f32) -> vector<4xf32> | |
%918 = "vector.fma"(%917, %826, %915) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%919 = "vector.extract"(%723) {position = [0]} : (vector<4xf32>) -> f32 | |
%920 = "vector.splat"(%919) : (f32) -> vector<4xf32> | |
%921 = "vector.fma"(%920, %828, %918) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%922 = "vector.extract"(%723) {position = [1]} : (vector<4xf32>) -> f32 | |
%923 = "vector.splat"(%922) : (f32) -> vector<4xf32> | |
%924 = "vector.fma"(%923, %830, %921) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%925 = "vector.extract"(%723) {position = [2]} : (vector<4xf32>) -> f32 | |
%926 = "vector.splat"(%925) : (f32) -> vector<4xf32> | |
%927 = "vector.fma"(%926, %832, %924) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%928 = "vector.extract"(%723) {position = [3]} : (vector<4xf32>) -> f32 | |
%929 = "vector.splat"(%928) : (f32) -> vector<4xf32> | |
%930 = "vector.fma"(%929, %834, %927) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%931 = "vector.extract"(%725) {position = [0]} : (vector<4xf32>) -> f32 | |
%932 = "vector.splat"(%931) : (f32) -> vector<4xf32> | |
%933 = "vector.fma"(%932, %772, %arg2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%934 = "vector.extract"(%725) {position = [1]} : (vector<4xf32>) -> f32 | |
%935 = "vector.splat"(%934) : (f32) -> vector<4xf32> | |
%936 = "vector.fma"(%935, %774, %933) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%937 = "vector.extract"(%725) {position = [2]} : (vector<4xf32>) -> f32 | |
%938 = "vector.splat"(%937) : (f32) -> vector<4xf32> | |
%939 = "vector.fma"(%938, %776, %936) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%940 = "vector.extract"(%725) {position = [3]} : (vector<4xf32>) -> f32 | |
%941 = "vector.splat"(%940) : (f32) -> vector<4xf32> | |
%942 = "vector.fma"(%941, %778, %939) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%943 = "vector.extract"(%727) {position = [0]} : (vector<4xf32>) -> f32 | |
%944 = "vector.splat"(%943) : (f32) -> vector<4xf32> | |
%945 = "vector.fma"(%944, %780, %942) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%946 = "vector.extract"(%727) {position = [1]} : (vector<4xf32>) -> f32 | |
%947 = "vector.splat"(%946) : (f32) -> vector<4xf32> | |
%948 = "vector.fma"(%947, %782, %945) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%949 = "vector.extract"(%727) {position = [2]} : (vector<4xf32>) -> f32 | |
%950 = "vector.splat"(%949) : (f32) -> vector<4xf32> | |
%951 = "vector.fma"(%950, %784, %948) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%952 = "vector.extract"(%727) {position = [3]} : (vector<4xf32>) -> f32 | |
%953 = "vector.splat"(%952) : (f32) -> vector<4xf32> | |
%954 = "vector.fma"(%953, %786, %951) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%955 = "vector.extract"(%729) {position = [0]} : (vector<4xf32>) -> f32 | |
%956 = "vector.splat"(%955) : (f32) -> vector<4xf32> | |
%957 = "vector.fma"(%956, %788, %954) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%958 = "vector.extract"(%729) {position = [1]} : (vector<4xf32>) -> f32 | |
%959 = "vector.splat"(%958) : (f32) -> vector<4xf32> | |
%960 = "vector.fma"(%959, %790, %957) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%961 = "vector.extract"(%729) {position = [2]} : (vector<4xf32>) -> f32 | |
%962 = "vector.splat"(%961) : (f32) -> vector<4xf32> | |
%963 = "vector.fma"(%962, %792, %960) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%964 = "vector.extract"(%729) {position = [3]} : (vector<4xf32>) -> f32 | |
%965 = "vector.splat"(%964) : (f32) -> vector<4xf32> | |
%966 = "vector.fma"(%965, %794, %963) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%967 = "vector.extract"(%731) {position = [0]} : (vector<4xf32>) -> f32 | |
%968 = "vector.splat"(%967) : (f32) -> vector<4xf32> | |
%969 = "vector.fma"(%968, %796, %966) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%970 = "vector.extract"(%731) {position = [1]} : (vector<4xf32>) -> f32 | |
%971 = "vector.splat"(%970) : (f32) -> vector<4xf32> | |
%972 = "vector.fma"(%971, %798, %969) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%973 = "vector.extract"(%731) {position = [2]} : (vector<4xf32>) -> f32 | |
%974 = "vector.splat"(%973) : (f32) -> vector<4xf32> | |
%975 = "vector.fma"(%974, %800, %972) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%976 = "vector.extract"(%731) {position = [3]} : (vector<4xf32>) -> f32 | |
%977 = "vector.splat"(%976) : (f32) -> vector<4xf32> | |
%978 = "vector.fma"(%977, %802, %975) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%979 = "vector.extract"(%733) {position = [0]} : (vector<4xf32>) -> f32 | |
%980 = "vector.splat"(%979) : (f32) -> vector<4xf32> | |
%981 = "vector.fma"(%980, %804, %978) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%982 = "vector.extract"(%733) {position = [1]} : (vector<4xf32>) -> f32 | |
%983 = "vector.splat"(%982) : (f32) -> vector<4xf32> | |
%984 = "vector.fma"(%983, %806, %981) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%985 = "vector.extract"(%733) {position = [2]} : (vector<4xf32>) -> f32 | |
%986 = "vector.splat"(%985) : (f32) -> vector<4xf32> | |
%987 = "vector.fma"(%986, %808, %984) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%988 = "vector.extract"(%733) {position = [3]} : (vector<4xf32>) -> f32 | |
%989 = "vector.splat"(%988) : (f32) -> vector<4xf32> | |
%990 = "vector.fma"(%989, %810, %987) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%991 = "vector.extract"(%735) {position = [0]} : (vector<4xf32>) -> f32 | |
%992 = "vector.splat"(%991) : (f32) -> vector<4xf32> | |
%993 = "vector.fma"(%992, %812, %990) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%994 = "vector.extract"(%735) {position = [1]} : (vector<4xf32>) -> f32 | |
%995 = "vector.splat"(%994) : (f32) -> vector<4xf32> | |
%996 = "vector.fma"(%995, %814, %993) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%997 = "vector.extract"(%735) {position = [2]} : (vector<4xf32>) -> f32 | |
%998 = "vector.splat"(%997) : (f32) -> vector<4xf32> | |
%999 = "vector.fma"(%998, %816, %996) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1000 = "vector.extract"(%735) {position = [3]} : (vector<4xf32>) -> f32 | |
%1001 = "vector.splat"(%1000) : (f32) -> vector<4xf32> | |
%1002 = "vector.fma"(%1001, %818, %999) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1003 = "vector.extract"(%737) {position = [0]} : (vector<4xf32>) -> f32 | |
%1004 = "vector.splat"(%1003) : (f32) -> vector<4xf32> | |
%1005 = "vector.fma"(%1004, %820, %1002) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1006 = "vector.extract"(%737) {position = [1]} : (vector<4xf32>) -> f32 | |
%1007 = "vector.splat"(%1006) : (f32) -> vector<4xf32> | |
%1008 = "vector.fma"(%1007, %822, %1005) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1009 = "vector.extract"(%737) {position = [2]} : (vector<4xf32>) -> f32 | |
%1010 = "vector.splat"(%1009) : (f32) -> vector<4xf32> | |
%1011 = "vector.fma"(%1010, %824, %1008) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1012 = "vector.extract"(%737) {position = [3]} : (vector<4xf32>) -> f32 | |
%1013 = "vector.splat"(%1012) : (f32) -> vector<4xf32> | |
%1014 = "vector.fma"(%1013, %826, %1011) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1015 = "vector.extract"(%739) {position = [0]} : (vector<4xf32>) -> f32 | |
%1016 = "vector.splat"(%1015) : (f32) -> vector<4xf32> | |
%1017 = "vector.fma"(%1016, %828, %1014) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1018 = "vector.extract"(%739) {position = [1]} : (vector<4xf32>) -> f32 | |
%1019 = "vector.splat"(%1018) : (f32) -> vector<4xf32> | |
%1020 = "vector.fma"(%1019, %830, %1017) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1021 = "vector.extract"(%739) {position = [2]} : (vector<4xf32>) -> f32 | |
%1022 = "vector.splat"(%1021) : (f32) -> vector<4xf32> | |
%1023 = "vector.fma"(%1022, %832, %1020) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1024 = "vector.extract"(%739) {position = [3]} : (vector<4xf32>) -> f32 | |
%1025 = "vector.splat"(%1024) : (f32) -> vector<4xf32> | |
%1026 = "vector.fma"(%1025, %834, %1023) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1027 = "vector.extract"(%741) {position = [0]} : (vector<4xf32>) -> f32 | |
%1028 = "vector.splat"(%1027) : (f32) -> vector<4xf32> | |
%1029 = "vector.fma"(%1028, %772, %arg3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1030 = "vector.extract"(%741) {position = [1]} : (vector<4xf32>) -> f32 | |
%1031 = "vector.splat"(%1030) : (f32) -> vector<4xf32> | |
%1032 = "vector.fma"(%1031, %774, %1029) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1033 = "vector.extract"(%741) {position = [2]} : (vector<4xf32>) -> f32 | |
%1034 = "vector.splat"(%1033) : (f32) -> vector<4xf32> | |
%1035 = "vector.fma"(%1034, %776, %1032) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1036 = "vector.extract"(%741) {position = [3]} : (vector<4xf32>) -> f32 | |
%1037 = "vector.splat"(%1036) : (f32) -> vector<4xf32> | |
%1038 = "vector.fma"(%1037, %778, %1035) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1039 = "vector.extract"(%743) {position = [0]} : (vector<4xf32>) -> f32 | |
%1040 = "vector.splat"(%1039) : (f32) -> vector<4xf32> | |
%1041 = "vector.fma"(%1040, %780, %1038) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1042 = "vector.extract"(%743) {position = [1]} : (vector<4xf32>) -> f32 | |
%1043 = "vector.splat"(%1042) : (f32) -> vector<4xf32> | |
%1044 = "vector.fma"(%1043, %782, %1041) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1045 = "vector.extract"(%743) {position = [2]} : (vector<4xf32>) -> f32 | |
%1046 = "vector.splat"(%1045) : (f32) -> vector<4xf32> | |
%1047 = "vector.fma"(%1046, %784, %1044) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1048 = "vector.extract"(%743) {position = [3]} : (vector<4xf32>) -> f32 | |
%1049 = "vector.splat"(%1048) : (f32) -> vector<4xf32> | |
%1050 = "vector.fma"(%1049, %786, %1047) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1051 = "vector.extract"(%745) {position = [0]} : (vector<4xf32>) -> f32 | |
%1052 = "vector.splat"(%1051) : (f32) -> vector<4xf32> | |
%1053 = "vector.fma"(%1052, %788, %1050) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1054 = "vector.extract"(%745) {position = [1]} : (vector<4xf32>) -> f32 | |
%1055 = "vector.splat"(%1054) : (f32) -> vector<4xf32> | |
%1056 = "vector.fma"(%1055, %790, %1053) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1057 = "vector.extract"(%745) {position = [2]} : (vector<4xf32>) -> f32 | |
%1058 = "vector.splat"(%1057) : (f32) -> vector<4xf32> | |
%1059 = "vector.fma"(%1058, %792, %1056) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1060 = "vector.extract"(%745) {position = [3]} : (vector<4xf32>) -> f32 | |
%1061 = "vector.splat"(%1060) : (f32) -> vector<4xf32> | |
%1062 = "vector.fma"(%1061, %794, %1059) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1063 = "vector.extract"(%747) {position = [0]} : (vector<4xf32>) -> f32 | |
%1064 = "vector.splat"(%1063) : (f32) -> vector<4xf32> | |
%1065 = "vector.fma"(%1064, %796, %1062) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1066 = "vector.extract"(%747) {position = [1]} : (vector<4xf32>) -> f32 | |
%1067 = "vector.splat"(%1066) : (f32) -> vector<4xf32> | |
%1068 = "vector.fma"(%1067, %798, %1065) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1069 = "vector.extract"(%747) {position = [2]} : (vector<4xf32>) -> f32 | |
%1070 = "vector.splat"(%1069) : (f32) -> vector<4xf32> | |
%1071 = "vector.fma"(%1070, %800, %1068) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1072 = "vector.extract"(%747) {position = [3]} : (vector<4xf32>) -> f32 | |
%1073 = "vector.splat"(%1072) : (f32) -> vector<4xf32> | |
%1074 = "vector.fma"(%1073, %802, %1071) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1075 = "vector.extract"(%749) {position = [0]} : (vector<4xf32>) -> f32 | |
%1076 = "vector.splat"(%1075) : (f32) -> vector<4xf32> | |
%1077 = "vector.fma"(%1076, %804, %1074) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1078 = "vector.extract"(%749) {position = [1]} : (vector<4xf32>) -> f32 | |
%1079 = "vector.splat"(%1078) : (f32) -> vector<4xf32> | |
%1080 = "vector.fma"(%1079, %806, %1077) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1081 = "vector.extract"(%749) {position = [2]} : (vector<4xf32>) -> f32 | |
%1082 = "vector.splat"(%1081) : (f32) -> vector<4xf32> | |
%1083 = "vector.fma"(%1082, %808, %1080) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1084 = "vector.extract"(%749) {position = [3]} : (vector<4xf32>) -> f32 | |
%1085 = "vector.splat"(%1084) : (f32) -> vector<4xf32> | |
%1086 = "vector.fma"(%1085, %810, %1083) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1087 = "vector.extract"(%751) {position = [0]} : (vector<4xf32>) -> f32 | |
%1088 = "vector.splat"(%1087) : (f32) -> vector<4xf32> | |
%1089 = "vector.fma"(%1088, %812, %1086) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1090 = "vector.extract"(%751) {position = [1]} : (vector<4xf32>) -> f32 | |
%1091 = "vector.splat"(%1090) : (f32) -> vector<4xf32> | |
%1092 = "vector.fma"(%1091, %814, %1089) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1093 = "vector.extract"(%751) {position = [2]} : (vector<4xf32>) -> f32 | |
%1094 = "vector.splat"(%1093) : (f32) -> vector<4xf32> | |
%1095 = "vector.fma"(%1094, %816, %1092) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1096 = "vector.extract"(%751) {position = [3]} : (vector<4xf32>) -> f32 | |
%1097 = "vector.splat"(%1096) : (f32) -> vector<4xf32> | |
%1098 = "vector.fma"(%1097, %818, %1095) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1099 = "vector.extract"(%753) {position = [0]} : (vector<4xf32>) -> f32 | |
%1100 = "vector.splat"(%1099) : (f32) -> vector<4xf32> | |
%1101 = "vector.fma"(%1100, %820, %1098) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1102 = "vector.extract"(%753) {position = [1]} : (vector<4xf32>) -> f32 | |
%1103 = "vector.splat"(%1102) : (f32) -> vector<4xf32> | |
%1104 = "vector.fma"(%1103, %822, %1101) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1105 = "vector.extract"(%753) {position = [2]} : (vector<4xf32>) -> f32 | |
%1106 = "vector.splat"(%1105) : (f32) -> vector<4xf32> | |
%1107 = "vector.fma"(%1106, %824, %1104) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1108 = "vector.extract"(%753) {position = [3]} : (vector<4xf32>) -> f32 | |
%1109 = "vector.splat"(%1108) : (f32) -> vector<4xf32> | |
%1110 = "vector.fma"(%1109, %826, %1107) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1111 = "vector.extract"(%755) {position = [0]} : (vector<4xf32>) -> f32 | |
%1112 = "vector.splat"(%1111) : (f32) -> vector<4xf32> | |
%1113 = "vector.fma"(%1112, %828, %1110) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1114 = "vector.extract"(%755) {position = [1]} : (vector<4xf32>) -> f32 | |
%1115 = "vector.splat"(%1114) : (f32) -> vector<4xf32> | |
%1116 = "vector.fma"(%1115, %830, %1113) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1117 = "vector.extract"(%755) {position = [2]} : (vector<4xf32>) -> f32 | |
%1118 = "vector.splat"(%1117) : (f32) -> vector<4xf32> | |
%1119 = "vector.fma"(%1118, %832, %1116) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1120 = "vector.extract"(%755) {position = [3]} : (vector<4xf32>) -> f32 | |
%1121 = "vector.splat"(%1120) : (f32) -> vector<4xf32> | |
%1122 = "vector.fma"(%1121, %834, %1119) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1123 = "vector.extract"(%757) {position = [0]} : (vector<4xf32>) -> f32 | |
%1124 = "vector.splat"(%1123) : (f32) -> vector<4xf32> | |
%1125 = "vector.fma"(%1124, %772, %arg4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1126 = "vector.extract"(%757) {position = [1]} : (vector<4xf32>) -> f32 | |
%1127 = "vector.splat"(%1126) : (f32) -> vector<4xf32> | |
%1128 = "vector.fma"(%1127, %774, %1125) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1129 = "vector.extract"(%757) {position = [2]} : (vector<4xf32>) -> f32 | |
%1130 = "vector.splat"(%1129) : (f32) -> vector<4xf32> | |
%1131 = "vector.fma"(%1130, %776, %1128) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1132 = "vector.extract"(%757) {position = [3]} : (vector<4xf32>) -> f32 | |
%1133 = "vector.splat"(%1132) : (f32) -> vector<4xf32> | |
%1134 = "vector.fma"(%1133, %778, %1131) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1135 = "vector.extract"(%759) {position = [0]} : (vector<4xf32>) -> f32 | |
%1136 = "vector.splat"(%1135) : (f32) -> vector<4xf32> | |
%1137 = "vector.fma"(%1136, %780, %1134) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1138 = "vector.extract"(%759) {position = [1]} : (vector<4xf32>) -> f32 | |
%1139 = "vector.splat"(%1138) : (f32) -> vector<4xf32> | |
%1140 = "vector.fma"(%1139, %782, %1137) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1141 = "vector.extract"(%759) {position = [2]} : (vector<4xf32>) -> f32 | |
%1142 = "vector.splat"(%1141) : (f32) -> vector<4xf32> | |
%1143 = "vector.fma"(%1142, %784, %1140) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1144 = "vector.extract"(%759) {position = [3]} : (vector<4xf32>) -> f32 | |
%1145 = "vector.splat"(%1144) : (f32) -> vector<4xf32> | |
%1146 = "vector.fma"(%1145, %786, %1143) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1147 = "vector.extract"(%761) {position = [0]} : (vector<4xf32>) -> f32 | |
%1148 = "vector.splat"(%1147) : (f32) -> vector<4xf32> | |
%1149 = "vector.fma"(%1148, %788, %1146) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1150 = "vector.extract"(%761) {position = [1]} : (vector<4xf32>) -> f32 | |
%1151 = "vector.splat"(%1150) : (f32) -> vector<4xf32> | |
%1152 = "vector.fma"(%1151, %790, %1149) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1153 = "vector.extract"(%761) {position = [2]} : (vector<4xf32>) -> f32 | |
%1154 = "vector.splat"(%1153) : (f32) -> vector<4xf32> | |
%1155 = "vector.fma"(%1154, %792, %1152) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1156 = "vector.extract"(%761) {position = [3]} : (vector<4xf32>) -> f32 | |
%1157 = "vector.splat"(%1156) : (f32) -> vector<4xf32> | |
%1158 = "vector.fma"(%1157, %794, %1155) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1159 = "vector.extract"(%763) {position = [0]} : (vector<4xf32>) -> f32 | |
%1160 = "vector.splat"(%1159) : (f32) -> vector<4xf32> | |
%1161 = "vector.fma"(%1160, %796, %1158) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1162 = "vector.extract"(%763) {position = [1]} : (vector<4xf32>) -> f32 | |
%1163 = "vector.splat"(%1162) : (f32) -> vector<4xf32> | |
%1164 = "vector.fma"(%1163, %798, %1161) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1165 = "vector.extract"(%763) {position = [2]} : (vector<4xf32>) -> f32 | |
%1166 = "vector.splat"(%1165) : (f32) -> vector<4xf32> | |
%1167 = "vector.fma"(%1166, %800, %1164) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1168 = "vector.extract"(%763) {position = [3]} : (vector<4xf32>) -> f32 | |
%1169 = "vector.splat"(%1168) : (f32) -> vector<4xf32> | |
%1170 = "vector.fma"(%1169, %802, %1167) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1171 = "vector.extract"(%765) {position = [0]} : (vector<4xf32>) -> f32 | |
%1172 = "vector.splat"(%1171) : (f32) -> vector<4xf32> | |
%1173 = "vector.fma"(%1172, %804, %1170) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1174 = "vector.extract"(%765) {position = [1]} : (vector<4xf32>) -> f32 | |
%1175 = "vector.splat"(%1174) : (f32) -> vector<4xf32> | |
%1176 = "vector.fma"(%1175, %806, %1173) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1177 = "vector.extract"(%765) {position = [2]} : (vector<4xf32>) -> f32 | |
%1178 = "vector.splat"(%1177) : (f32) -> vector<4xf32> | |
%1179 = "vector.fma"(%1178, %808, %1176) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1180 = "vector.extract"(%765) {position = [3]} : (vector<4xf32>) -> f32 | |
%1181 = "vector.splat"(%1180) : (f32) -> vector<4xf32> | |
%1182 = "vector.fma"(%1181, %810, %1179) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1183 = "vector.extract"(%767) {position = [0]} : (vector<4xf32>) -> f32 | |
%1184 = "vector.splat"(%1183) : (f32) -> vector<4xf32> | |
%1185 = "vector.fma"(%1184, %812, %1182) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1186 = "vector.extract"(%767) {position = [1]} : (vector<4xf32>) -> f32 | |
%1187 = "vector.splat"(%1186) : (f32) -> vector<4xf32> | |
%1188 = "vector.fma"(%1187, %814, %1185) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1189 = "vector.extract"(%767) {position = [2]} : (vector<4xf32>) -> f32 | |
%1190 = "vector.splat"(%1189) : (f32) -> vector<4xf32> | |
%1191 = "vector.fma"(%1190, %816, %1188) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1192 = "vector.extract"(%767) {position = [3]} : (vector<4xf32>) -> f32 | |
%1193 = "vector.splat"(%1192) : (f32) -> vector<4xf32> | |
%1194 = "vector.fma"(%1193, %818, %1191) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1195 = "vector.extract"(%769) {position = [0]} : (vector<4xf32>) -> f32 | |
%1196 = "vector.splat"(%1195) : (f32) -> vector<4xf32> | |
%1197 = "vector.fma"(%1196, %820, %1194) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1198 = "vector.extract"(%769) {position = [1]} : (vector<4xf32>) -> f32 | |
%1199 = "vector.splat"(%1198) : (f32) -> vector<4xf32> | |
%1200 = "vector.fma"(%1199, %822, %1197) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1201 = "vector.extract"(%769) {position = [2]} : (vector<4xf32>) -> f32 | |
%1202 = "vector.splat"(%1201) : (f32) -> vector<4xf32> | |
%1203 = "vector.fma"(%1202, %824, %1200) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1204 = "vector.extract"(%769) {position = [3]} : (vector<4xf32>) -> f32 | |
%1205 = "vector.splat"(%1204) : (f32) -> vector<4xf32> | |
%1206 = "vector.fma"(%1205, %826, %1203) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1207 = "vector.extract"(%771) {position = [0]} : (vector<4xf32>) -> f32 | |
%1208 = "vector.splat"(%1207) : (f32) -> vector<4xf32> | |
%1209 = "vector.fma"(%1208, %828, %1206) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1210 = "vector.extract"(%771) {position = [1]} : (vector<4xf32>) -> f32 | |
%1211 = "vector.splat"(%1210) : (f32) -> vector<4xf32> | |
%1212 = "vector.fma"(%1211, %830, %1209) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1213 = "vector.extract"(%771) {position = [2]} : (vector<4xf32>) -> f32 | |
%1214 = "vector.splat"(%1213) : (f32) -> vector<4xf32> | |
%1215 = "vector.fma"(%1214, %832, %1212) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1216 = "vector.extract"(%771) {position = [3]} : (vector<4xf32>) -> f32 | |
%1217 = "vector.splat"(%1216) : (f32) -> vector<4xf32> | |
%1218 = "vector.fma"(%1217, %834, %1215) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%1219 = "arith.addi"(%arg0, %80) : (index, index) -> index | |
%1220 = "arith.cmpi"(%1219, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
%1221 = "arith.subi"(%0, %arg0) : (index, index) -> index | |
%1222 = "arith.select"(%1220, %1221, %1219) : (i1, index, index) -> index | |
%1223 = "arith.divsi"(%1222, %53) : (index, index) -> index | |
%1224 = "arith.subi"(%70, %1223) : (index, index) -> index | |
%1225 = "arith.select"(%1220, %1224, %1223) : (i1, index, index) -> index | |
%1226 = "arith.addi"(%139, %1225) : (index, index) -> index | |
%1227 = "arith.addi"(%1226, %146) : (index, index) -> index | |
%1228 = "arith.addi"(%1227, %75) : (index, index) -> index | |
%1229 = "memref.load"(%93, %1228) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1230 = "arith.addi"(%1227, %63) : (index, index) -> index | |
%1231 = "memref.load"(%94, %1230) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1232 = "arith.muli"(%1219, %69) : (index, index) -> index | |
%1233 = "arith.addi"(%1232, %81) : (index, index) -> index | |
%1234 = "arith.addi"(%1233, %152) : (index, index) -> index | |
%1235 = "arith.addi"(%1234, %154) : (index, index) -> index | |
%1236 = "arith.addi"(%1235, %118) : (index, index) -> index | |
%1237 = "arith.addi"(%1236, %160) : (index, index) -> index | |
%1238 = "arith.addi"(%1237, %167) : (index, index) -> index | |
%1239 = "memref.load"(%98, %1238) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
%1240 = "arith.addi"(%1238, %62) : (index, index) -> index | |
%1241 = "memref.load"(%99, %1240) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
"scf.yield"(%930, %1026, %1122, %1218, %1229, %1231, %1239, %1241) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () | |
}) : (index, index, index, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) | |
"gpu.barrier"() : () -> () | |
%173 = "arith.muli"(%82, %60) : (index, index) -> index | |
%174 = "arith.addi"(%81, %173) : (index, index) -> index | |
%175 = "arith.muli"(%83, %77) : (index, index) -> index | |
%176 = "arith.addi"(%174, %175) : (index, index) -> index | |
%177 = "arith.addi"(%176, %145) : (index, index) -> index | |
"memref.store"(%172#4, %84, %177) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%178 = "arith.addi"(%177, %77) : (index, index) -> index | |
"memref.store"(%172#5, %84, %178) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%179 = "arith.muli"(%82, %59) : (index, index) -> index | |
%180 = "arith.addi"(%81, %179) : (index, index) -> index | |
%181 = "arith.muli"(%83, %58) : (index, index) -> index | |
%182 = "arith.addi"(%180, %181) : (index, index) -> index | |
%183 = "arith.addi"(%182, %159) : (index, index) -> index | |
"memref.store"(%172#6, %85, %183) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
%184 = "arith.addi"(%183, %58) : (index, index) -> index | |
"memref.store"(%172#7, %85, %184) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
"gpu.barrier"() : () -> () | |
%185 = "arith.muli"(%82, %57) : (index, index) -> index | |
%186 = "memref.load"(%84, %185) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%187 = "arith.addi"(%185, %56) : (index, index) -> index | |
%188 = "memref.load"(%84, %187) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%189 = "arith.addi"(%185, %55) : (index, index) -> index | |
%190 = "memref.load"(%84, %189) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%191 = "arith.addi"(%185, %54) : (index, index) -> index | |
%192 = "memref.load"(%84, %191) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%193 = "arith.addi"(%185, %53) : (index, index) -> index | |
%194 = "memref.load"(%84, %193) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%195 = "arith.addi"(%185, %52) : (index, index) -> index | |
%196 = "memref.load"(%84, %195) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%197 = "arith.addi"(%185, %51) : (index, index) -> index | |
%198 = "memref.load"(%84, %197) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%199 = "arith.addi"(%185, %50) : (index, index) -> index | |
%200 = "memref.load"(%84, %199) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%201 = "arith.addi"(%185, %49) : (index, index) -> index | |
%202 = "memref.load"(%84, %201) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%203 = "arith.addi"(%185, %48) : (index, index) -> index | |
%204 = "memref.load"(%84, %203) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%205 = "arith.addi"(%185, %47) : (index, index) -> index | |
%206 = "memref.load"(%84, %205) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%207 = "arith.addi"(%185, %46) : (index, index) -> index | |
%208 = "memref.load"(%84, %207) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%209 = "arith.addi"(%185, %45) : (index, index) -> index | |
%210 = "memref.load"(%84, %209) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%211 = "arith.addi"(%185, %44) : (index, index) -> index | |
%212 = "memref.load"(%84, %211) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%213 = "arith.addi"(%185, %43) : (index, index) -> index | |
%214 = "memref.load"(%84, %213) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%215 = "arith.addi"(%185, %71) : (index, index) -> index | |
%216 = "memref.load"(%84, %215) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%217 = "arith.addi"(%185, %60) : (index, index) -> index | |
%218 = "memref.load"(%84, %217) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%219 = "arith.addi"(%185, %42) : (index, index) -> index | |
%220 = "memref.load"(%84, %219) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%221 = "arith.addi"(%185, %41) : (index, index) -> index | |
%222 = "memref.load"(%84, %221) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%223 = "arith.addi"(%185, %40) : (index, index) -> index | |
%224 = "memref.load"(%84, %223) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%225 = "arith.addi"(%185, %39) : (index, index) -> index | |
%226 = "memref.load"(%84, %225) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%227 = "arith.addi"(%185, %38) : (index, index) -> index | |
%228 = "memref.load"(%84, %227) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%229 = "arith.addi"(%185, %37) : (index, index) -> index | |
%230 = "memref.load"(%84, %229) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%231 = "arith.addi"(%185, %36) : (index, index) -> index | |
%232 = "memref.load"(%84, %231) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%233 = "arith.addi"(%185, %35) : (index, index) -> index | |
%234 = "memref.load"(%84, %233) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%235 = "arith.addi"(%185, %34) : (index, index) -> index | |
%236 = "memref.load"(%84, %235) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%237 = "arith.addi"(%185, %33) : (index, index) -> index | |
%238 = "memref.load"(%84, %237) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%239 = "arith.addi"(%185, %32) : (index, index) -> index | |
%240 = "memref.load"(%84, %239) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%241 = "arith.addi"(%185, %31) : (index, index) -> index | |
%242 = "memref.load"(%84, %241) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%243 = "arith.addi"(%185, %80) : (index, index) -> index | |
%244 = "memref.load"(%84, %243) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%245 = "arith.addi"(%185, %30) : (index, index) -> index | |
%246 = "memref.load"(%84, %245) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%247 = "arith.addi"(%185, %29) : (index, index) -> index | |
%248 = "memref.load"(%84, %247) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%249 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%250 = "arith.addi"(%81, %59) : (index, index) -> index | |
%251 = "memref.load"(%85, %250) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%252 = "arith.addi"(%81, %29) : (index, index) -> index | |
%253 = "memref.load"(%85, %252) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%254 = "arith.addi"(%81, %28) : (index, index) -> index | |
%255 = "memref.load"(%85, %254) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%256 = "arith.addi"(%81, %27) : (index, index) -> index | |
%257 = "memref.load"(%85, %256) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%258 = "arith.addi"(%81, %26) : (index, index) -> index | |
%259 = "memref.load"(%85, %258) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%260 = "arith.addi"(%81, %25) : (index, index) -> index | |
%261 = "memref.load"(%85, %260) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%262 = "arith.addi"(%81, %24) : (index, index) -> index | |
%263 = "memref.load"(%85, %262) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%264 = "arith.addi"(%81, %23) : (index, index) -> index | |
%265 = "memref.load"(%85, %264) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%266 = "arith.addi"(%81, %22) : (index, index) -> index | |
%267 = "memref.load"(%85, %266) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%268 = "arith.addi"(%81, %21) : (index, index) -> index | |
%269 = "memref.load"(%85, %268) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%270 = "arith.addi"(%81, %20) : (index, index) -> index | |
%271 = "memref.load"(%85, %270) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%272 = "arith.addi"(%81, %19) : (index, index) -> index | |
%273 = "memref.load"(%85, %272) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%274 = "arith.addi"(%81, %18) : (index, index) -> index | |
%275 = "memref.load"(%85, %274) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%276 = "arith.addi"(%81, %17) : (index, index) -> index | |
%277 = "memref.load"(%85, %276) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%278 = "arith.addi"(%81, %16) : (index, index) -> index | |
%279 = "memref.load"(%85, %278) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%280 = "arith.addi"(%81, %58) : (index, index) -> index | |
%281 = "memref.load"(%85, %280) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%282 = "arith.addi"(%81, %15) : (index, index) -> index | |
%283 = "memref.load"(%85, %282) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%284 = "arith.addi"(%81, %14) : (index, index) -> index | |
%285 = "memref.load"(%85, %284) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%286 = "arith.addi"(%81, %13) : (index, index) -> index | |
%287 = "memref.load"(%85, %286) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%288 = "arith.addi"(%81, %12) : (index, index) -> index | |
%289 = "memref.load"(%85, %288) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%290 = "arith.addi"(%81, %11) : (index, index) -> index | |
%291 = "memref.load"(%85, %290) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%292 = "arith.addi"(%81, %10) : (index, index) -> index | |
%293 = "memref.load"(%85, %292) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%294 = "arith.addi"(%81, %9) : (index, index) -> index | |
%295 = "memref.load"(%85, %294) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%296 = "arith.addi"(%81, %8) : (index, index) -> index | |
%297 = "memref.load"(%85, %296) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%298 = "arith.addi"(%81, %7) : (index, index) -> index | |
%299 = "memref.load"(%85, %298) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%300 = "arith.addi"(%81, %6) : (index, index) -> index | |
%301 = "memref.load"(%85, %300) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%302 = "arith.addi"(%81, %5) : (index, index) -> index | |
%303 = "memref.load"(%85, %302) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%304 = "arith.addi"(%81, %4) : (index, index) -> index | |
%305 = "memref.load"(%85, %304) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%306 = "arith.addi"(%81, %3) : (index, index) -> index | |
%307 = "memref.load"(%85, %306) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%308 = "arith.addi"(%81, %2) : (index, index) -> index | |
%309 = "memref.load"(%85, %308) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%310 = "arith.addi"(%81, %1) : (index, index) -> index | |
%311 = "memref.load"(%85, %310) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
%312 = "vector.extract"(%186) {position = [0]} : (vector<4xf32>) -> f32 | |
%313 = "vector.splat"(%312) : (f32) -> vector<4xf32> | |
%314 = "vector.fma"(%313, %249, %172#0) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%315 = "vector.extract"(%186) {position = [1]} : (vector<4xf32>) -> f32 | |
%316 = "vector.splat"(%315) : (f32) -> vector<4xf32> | |
%317 = "vector.fma"(%316, %251, %314) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%318 = "vector.extract"(%186) {position = [2]} : (vector<4xf32>) -> f32 | |
%319 = "vector.splat"(%318) : (f32) -> vector<4xf32> | |
%320 = "vector.fma"(%319, %253, %317) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%321 = "vector.extract"(%186) {position = [3]} : (vector<4xf32>) -> f32 | |
%322 = "vector.splat"(%321) : (f32) -> vector<4xf32> | |
%323 = "vector.fma"(%322, %255, %320) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%324 = "vector.extract"(%188) {position = [0]} : (vector<4xf32>) -> f32 | |
%325 = "vector.splat"(%324) : (f32) -> vector<4xf32> | |
%326 = "vector.fma"(%325, %257, %323) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%327 = "vector.extract"(%188) {position = [1]} : (vector<4xf32>) -> f32 | |
%328 = "vector.splat"(%327) : (f32) -> vector<4xf32> | |
%329 = "vector.fma"(%328, %259, %326) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%330 = "vector.extract"(%188) {position = [2]} : (vector<4xf32>) -> f32 | |
%331 = "vector.splat"(%330) : (f32) -> vector<4xf32> | |
%332 = "vector.fma"(%331, %261, %329) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%333 = "vector.extract"(%188) {position = [3]} : (vector<4xf32>) -> f32 | |
%334 = "vector.splat"(%333) : (f32) -> vector<4xf32> | |
%335 = "vector.fma"(%334, %263, %332) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%336 = "vector.extract"(%190) {position = [0]} : (vector<4xf32>) -> f32 | |
%337 = "vector.splat"(%336) : (f32) -> vector<4xf32> | |
%338 = "vector.fma"(%337, %265, %335) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%339 = "vector.extract"(%190) {position = [1]} : (vector<4xf32>) -> f32 | |
%340 = "vector.splat"(%339) : (f32) -> vector<4xf32> | |
%341 = "vector.fma"(%340, %267, %338) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%342 = "vector.extract"(%190) {position = [2]} : (vector<4xf32>) -> f32 | |
%343 = "vector.splat"(%342) : (f32) -> vector<4xf32> | |
%344 = "vector.fma"(%343, %269, %341) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%345 = "vector.extract"(%190) {position = [3]} : (vector<4xf32>) -> f32 | |
%346 = "vector.splat"(%345) : (f32) -> vector<4xf32> | |
%347 = "vector.fma"(%346, %271, %344) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%348 = "vector.extract"(%192) {position = [0]} : (vector<4xf32>) -> f32 | |
%349 = "vector.splat"(%348) : (f32) -> vector<4xf32> | |
%350 = "vector.fma"(%349, %273, %347) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%351 = "vector.extract"(%192) {position = [1]} : (vector<4xf32>) -> f32 | |
%352 = "vector.splat"(%351) : (f32) -> vector<4xf32> | |
%353 = "vector.fma"(%352, %275, %350) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%354 = "vector.extract"(%192) {position = [2]} : (vector<4xf32>) -> f32 | |
%355 = "vector.splat"(%354) : (f32) -> vector<4xf32> | |
%356 = "vector.fma"(%355, %277, %353) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%357 = "vector.extract"(%192) {position = [3]} : (vector<4xf32>) -> f32 | |
%358 = "vector.splat"(%357) : (f32) -> vector<4xf32> | |
%359 = "vector.fma"(%358, %279, %356) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%360 = "vector.extract"(%194) {position = [0]} : (vector<4xf32>) -> f32 | |
%361 = "vector.splat"(%360) : (f32) -> vector<4xf32> | |
%362 = "vector.fma"(%361, %281, %359) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%363 = "vector.extract"(%194) {position = [1]} : (vector<4xf32>) -> f32 | |
%364 = "vector.splat"(%363) : (f32) -> vector<4xf32> | |
%365 = "vector.fma"(%364, %283, %362) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%366 = "vector.extract"(%194) {position = [2]} : (vector<4xf32>) -> f32 | |
%367 = "vector.splat"(%366) : (f32) -> vector<4xf32> | |
%368 = "vector.fma"(%367, %285, %365) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%369 = "vector.extract"(%194) {position = [3]} : (vector<4xf32>) -> f32 | |
%370 = "vector.splat"(%369) : (f32) -> vector<4xf32> | |
%371 = "vector.fma"(%370, %287, %368) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%372 = "vector.extract"(%196) {position = [0]} : (vector<4xf32>) -> f32 | |
%373 = "vector.splat"(%372) : (f32) -> vector<4xf32> | |
%374 = "vector.fma"(%373, %289, %371) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%375 = "vector.extract"(%196) {position = [1]} : (vector<4xf32>) -> f32 | |
%376 = "vector.splat"(%375) : (f32) -> vector<4xf32> | |
%377 = "vector.fma"(%376, %291, %374) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%378 = "vector.extract"(%196) {position = [2]} : (vector<4xf32>) -> f32 | |
%379 = "vector.splat"(%378) : (f32) -> vector<4xf32> | |
%380 = "vector.fma"(%379, %293, %377) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%381 = "vector.extract"(%196) {position = [3]} : (vector<4xf32>) -> f32 | |
%382 = "vector.splat"(%381) : (f32) -> vector<4xf32> | |
%383 = "vector.fma"(%382, %295, %380) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%384 = "vector.extract"(%198) {position = [0]} : (vector<4xf32>) -> f32 | |
%385 = "vector.splat"(%384) : (f32) -> vector<4xf32> | |
%386 = "vector.fma"(%385, %297, %383) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%387 = "vector.extract"(%198) {position = [1]} : (vector<4xf32>) -> f32 | |
%388 = "vector.splat"(%387) : (f32) -> vector<4xf32> | |
%389 = "vector.fma"(%388, %299, %386) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%390 = "vector.extract"(%198) {position = [2]} : (vector<4xf32>) -> f32 | |
%391 = "vector.splat"(%390) : (f32) -> vector<4xf32> | |
%392 = "vector.fma"(%391, %301, %389) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%393 = "vector.extract"(%198) {position = [3]} : (vector<4xf32>) -> f32 | |
%394 = "vector.splat"(%393) : (f32) -> vector<4xf32> | |
%395 = "vector.fma"(%394, %303, %392) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%396 = "vector.extract"(%200) {position = [0]} : (vector<4xf32>) -> f32 | |
%397 = "vector.splat"(%396) : (f32) -> vector<4xf32> | |
%398 = "vector.fma"(%397, %305, %395) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%399 = "vector.extract"(%200) {position = [1]} : (vector<4xf32>) -> f32 | |
%400 = "vector.splat"(%399) : (f32) -> vector<4xf32> | |
%401 = "vector.fma"(%400, %307, %398) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%402 = "vector.extract"(%200) {position = [2]} : (vector<4xf32>) -> f32 | |
%403 = "vector.splat"(%402) : (f32) -> vector<4xf32> | |
%404 = "vector.fma"(%403, %309, %401) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%405 = "vector.extract"(%200) {position = [3]} : (vector<4xf32>) -> f32 | |
%406 = "vector.splat"(%405) : (f32) -> vector<4xf32> | |
%407 = "vector.fma"(%406, %311, %404) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%408 = "vector.extract"(%202) {position = [0]} : (vector<4xf32>) -> f32 | |
%409 = "vector.splat"(%408) : (f32) -> vector<4xf32> | |
%410 = "vector.fma"(%409, %249, %172#1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%411 = "vector.extract"(%202) {position = [1]} : (vector<4xf32>) -> f32 | |
%412 = "vector.splat"(%411) : (f32) -> vector<4xf32> | |
%413 = "vector.fma"(%412, %251, %410) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%414 = "vector.extract"(%202) {position = [2]} : (vector<4xf32>) -> f32 | |
%415 = "vector.splat"(%414) : (f32) -> vector<4xf32> | |
%416 = "vector.fma"(%415, %253, %413) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%417 = "vector.extract"(%202) {position = [3]} : (vector<4xf32>) -> f32 | |
%418 = "vector.splat"(%417) : (f32) -> vector<4xf32> | |
%419 = "vector.fma"(%418, %255, %416) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%420 = "vector.extract"(%204) {position = [0]} : (vector<4xf32>) -> f32 | |
%421 = "vector.splat"(%420) : (f32) -> vector<4xf32> | |
%422 = "vector.fma"(%421, %257, %419) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%423 = "vector.extract"(%204) {position = [1]} : (vector<4xf32>) -> f32 | |
%424 = "vector.splat"(%423) : (f32) -> vector<4xf32> | |
%425 = "vector.fma"(%424, %259, %422) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%426 = "vector.extract"(%204) {position = [2]} : (vector<4xf32>) -> f32 | |
%427 = "vector.splat"(%426) : (f32) -> vector<4xf32> | |
%428 = "vector.fma"(%427, %261, %425) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%429 = "vector.extract"(%204) {position = [3]} : (vector<4xf32>) -> f32 | |
%430 = "vector.splat"(%429) : (f32) -> vector<4xf32> | |
%431 = "vector.fma"(%430, %263, %428) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%432 = "vector.extract"(%206) {position = [0]} : (vector<4xf32>) -> f32 | |
%433 = "vector.splat"(%432) : (f32) -> vector<4xf32> | |
%434 = "vector.fma"(%433, %265, %431) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%435 = "vector.extract"(%206) {position = [1]} : (vector<4xf32>) -> f32 | |
%436 = "vector.splat"(%435) : (f32) -> vector<4xf32> | |
%437 = "vector.fma"(%436, %267, %434) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%438 = "vector.extract"(%206) {position = [2]} : (vector<4xf32>) -> f32 | |
%439 = "vector.splat"(%438) : (f32) -> vector<4xf32> | |
%440 = "vector.fma"(%439, %269, %437) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%441 = "vector.extract"(%206) {position = [3]} : (vector<4xf32>) -> f32 | |
%442 = "vector.splat"(%441) : (f32) -> vector<4xf32> | |
%443 = "vector.fma"(%442, %271, %440) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%444 = "vector.extract"(%208) {position = [0]} : (vector<4xf32>) -> f32 | |
%445 = "vector.splat"(%444) : (f32) -> vector<4xf32> | |
%446 = "vector.fma"(%445, %273, %443) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%447 = "vector.extract"(%208) {position = [1]} : (vector<4xf32>) -> f32 | |
%448 = "vector.splat"(%447) : (f32) -> vector<4xf32> | |
%449 = "vector.fma"(%448, %275, %446) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%450 = "vector.extract"(%208) {position = [2]} : (vector<4xf32>) -> f32 | |
%451 = "vector.splat"(%450) : (f32) -> vector<4xf32> | |
%452 = "vector.fma"(%451, %277, %449) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%453 = "vector.extract"(%208) {position = [3]} : (vector<4xf32>) -> f32 | |
%454 = "vector.splat"(%453) : (f32) -> vector<4xf32> | |
%455 = "vector.fma"(%454, %279, %452) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%456 = "vector.extract"(%210) {position = [0]} : (vector<4xf32>) -> f32 | |
%457 = "vector.splat"(%456) : (f32) -> vector<4xf32> | |
%458 = "vector.fma"(%457, %281, %455) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%459 = "vector.extract"(%210) {position = [1]} : (vector<4xf32>) -> f32 | |
%460 = "vector.splat"(%459) : (f32) -> vector<4xf32> | |
%461 = "vector.fma"(%460, %283, %458) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%462 = "vector.extract"(%210) {position = [2]} : (vector<4xf32>) -> f32 | |
%463 = "vector.splat"(%462) : (f32) -> vector<4xf32> | |
%464 = "vector.fma"(%463, %285, %461) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%465 = "vector.extract"(%210) {position = [3]} : (vector<4xf32>) -> f32 | |
%466 = "vector.splat"(%465) : (f32) -> vector<4xf32> | |
%467 = "vector.fma"(%466, %287, %464) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%468 = "vector.extract"(%212) {position = [0]} : (vector<4xf32>) -> f32 | |
%469 = "vector.splat"(%468) : (f32) -> vector<4xf32> | |
%470 = "vector.fma"(%469, %289, %467) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%471 = "vector.extract"(%212) {position = [1]} : (vector<4xf32>) -> f32 | |
%472 = "vector.splat"(%471) : (f32) -> vector<4xf32> | |
%473 = "vector.fma"(%472, %291, %470) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%474 = "vector.extract"(%212) {position = [2]} : (vector<4xf32>) -> f32 | |
%475 = "vector.splat"(%474) : (f32) -> vector<4xf32> | |
%476 = "vector.fma"(%475, %293, %473) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%477 = "vector.extract"(%212) {position = [3]} : (vector<4xf32>) -> f32 | |
%478 = "vector.splat"(%477) : (f32) -> vector<4xf32> | |
%479 = "vector.fma"(%478, %295, %476) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%480 = "vector.extract"(%214) {position = [0]} : (vector<4xf32>) -> f32 | |
%481 = "vector.splat"(%480) : (f32) -> vector<4xf32> | |
%482 = "vector.fma"(%481, %297, %479) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%483 = "vector.extract"(%214) {position = [1]} : (vector<4xf32>) -> f32 | |
%484 = "vector.splat"(%483) : (f32) -> vector<4xf32> | |
%485 = "vector.fma"(%484, %299, %482) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%486 = "vector.extract"(%214) {position = [2]} : (vector<4xf32>) -> f32 | |
%487 = "vector.splat"(%486) : (f32) -> vector<4xf32> | |
%488 = "vector.fma"(%487, %301, %485) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%489 = "vector.extract"(%214) {position = [3]} : (vector<4xf32>) -> f32 | |
%490 = "vector.splat"(%489) : (f32) -> vector<4xf32> | |
%491 = "vector.fma"(%490, %303, %488) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%492 = "vector.extract"(%216) {position = [0]} : (vector<4xf32>) -> f32 | |
%493 = "vector.splat"(%492) : (f32) -> vector<4xf32> | |
%494 = "vector.fma"(%493, %305, %491) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%495 = "vector.extract"(%216) {position = [1]} : (vector<4xf32>) -> f32 | |
%496 = "vector.splat"(%495) : (f32) -> vector<4xf32> | |
%497 = "vector.fma"(%496, %307, %494) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%498 = "vector.extract"(%216) {position = [2]} : (vector<4xf32>) -> f32 | |
%499 = "vector.splat"(%498) : (f32) -> vector<4xf32> | |
%500 = "vector.fma"(%499, %309, %497) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%501 = "vector.extract"(%216) {position = [3]} : (vector<4xf32>) -> f32 | |
%502 = "vector.splat"(%501) : (f32) -> vector<4xf32> | |
%503 = "vector.fma"(%502, %311, %500) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%504 = "vector.extract"(%218) {position = [0]} : (vector<4xf32>) -> f32 | |
%505 = "vector.splat"(%504) : (f32) -> vector<4xf32> | |
%506 = "vector.fma"(%505, %249, %172#2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%507 = "vector.extract"(%218) {position = [1]} : (vector<4xf32>) -> f32 | |
%508 = "vector.splat"(%507) : (f32) -> vector<4xf32> | |
%509 = "vector.fma"(%508, %251, %506) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%510 = "vector.extract"(%218) {position = [2]} : (vector<4xf32>) -> f32 | |
%511 = "vector.splat"(%510) : (f32) -> vector<4xf32> | |
%512 = "vector.fma"(%511, %253, %509) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%513 = "vector.extract"(%218) {position = [3]} : (vector<4xf32>) -> f32 | |
%514 = "vector.splat"(%513) : (f32) -> vector<4xf32> | |
%515 = "vector.fma"(%514, %255, %512) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%516 = "vector.extract"(%220) {position = [0]} : (vector<4xf32>) -> f32 | |
%517 = "vector.splat"(%516) : (f32) -> vector<4xf32> | |
%518 = "vector.fma"(%517, %257, %515) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%519 = "vector.extract"(%220) {position = [1]} : (vector<4xf32>) -> f32 | |
%520 = "vector.splat"(%519) : (f32) -> vector<4xf32> | |
%521 = "vector.fma"(%520, %259, %518) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%522 = "vector.extract"(%220) {position = [2]} : (vector<4xf32>) -> f32 | |
%523 = "vector.splat"(%522) : (f32) -> vector<4xf32> | |
%524 = "vector.fma"(%523, %261, %521) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%525 = "vector.extract"(%220) {position = [3]} : (vector<4xf32>) -> f32 | |
%526 = "vector.splat"(%525) : (f32) -> vector<4xf32> | |
%527 = "vector.fma"(%526, %263, %524) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%528 = "vector.extract"(%222) {position = [0]} : (vector<4xf32>) -> f32 | |
%529 = "vector.splat"(%528) : (f32) -> vector<4xf32> | |
%530 = "vector.fma"(%529, %265, %527) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%531 = "vector.extract"(%222) {position = [1]} : (vector<4xf32>) -> f32 | |
%532 = "vector.splat"(%531) : (f32) -> vector<4xf32> | |
%533 = "vector.fma"(%532, %267, %530) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%534 = "vector.extract"(%222) {position = [2]} : (vector<4xf32>) -> f32 | |
%535 = "vector.splat"(%534) : (f32) -> vector<4xf32> | |
%536 = "vector.fma"(%535, %269, %533) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%537 = "vector.extract"(%222) {position = [3]} : (vector<4xf32>) -> f32 | |
%538 = "vector.splat"(%537) : (f32) -> vector<4xf32> | |
%539 = "vector.fma"(%538, %271, %536) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%540 = "vector.extract"(%224) {position = [0]} : (vector<4xf32>) -> f32 | |
%541 = "vector.splat"(%540) : (f32) -> vector<4xf32> | |
%542 = "vector.fma"(%541, %273, %539) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%543 = "vector.extract"(%224) {position = [1]} : (vector<4xf32>) -> f32 | |
%544 = "vector.splat"(%543) : (f32) -> vector<4xf32> | |
%545 = "vector.fma"(%544, %275, %542) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%546 = "vector.extract"(%224) {position = [2]} : (vector<4xf32>) -> f32 | |
%547 = "vector.splat"(%546) : (f32) -> vector<4xf32> | |
%548 = "vector.fma"(%547, %277, %545) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%549 = "vector.extract"(%224) {position = [3]} : (vector<4xf32>) -> f32 | |
%550 = "vector.splat"(%549) : (f32) -> vector<4xf32> | |
%551 = "vector.fma"(%550, %279, %548) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%552 = "vector.extract"(%226) {position = [0]} : (vector<4xf32>) -> f32 | |
%553 = "vector.splat"(%552) : (f32) -> vector<4xf32> | |
%554 = "vector.fma"(%553, %281, %551) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%555 = "vector.extract"(%226) {position = [1]} : (vector<4xf32>) -> f32 | |
%556 = "vector.splat"(%555) : (f32) -> vector<4xf32> | |
%557 = "vector.fma"(%556, %283, %554) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%558 = "vector.extract"(%226) {position = [2]} : (vector<4xf32>) -> f32 | |
%559 = "vector.splat"(%558) : (f32) -> vector<4xf32> | |
%560 = "vector.fma"(%559, %285, %557) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%561 = "vector.extract"(%226) {position = [3]} : (vector<4xf32>) -> f32 | |
%562 = "vector.splat"(%561) : (f32) -> vector<4xf32> | |
%563 = "vector.fma"(%562, %287, %560) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%564 = "vector.extract"(%228) {position = [0]} : (vector<4xf32>) -> f32 | |
%565 = "vector.splat"(%564) : (f32) -> vector<4xf32> | |
%566 = "vector.fma"(%565, %289, %563) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%567 = "vector.extract"(%228) {position = [1]} : (vector<4xf32>) -> f32 | |
%568 = "vector.splat"(%567) : (f32) -> vector<4xf32> | |
%569 = "vector.fma"(%568, %291, %566) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%570 = "vector.extract"(%228) {position = [2]} : (vector<4xf32>) -> f32 | |
%571 = "vector.splat"(%570) : (f32) -> vector<4xf32> | |
%572 = "vector.fma"(%571, %293, %569) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%573 = "vector.extract"(%228) {position = [3]} : (vector<4xf32>) -> f32 | |
%574 = "vector.splat"(%573) : (f32) -> vector<4xf32> | |
%575 = "vector.fma"(%574, %295, %572) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%576 = "vector.extract"(%230) {position = [0]} : (vector<4xf32>) -> f32 | |
%577 = "vector.splat"(%576) : (f32) -> vector<4xf32> | |
%578 = "vector.fma"(%577, %297, %575) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%579 = "vector.extract"(%230) {position = [1]} : (vector<4xf32>) -> f32 | |
%580 = "vector.splat"(%579) : (f32) -> vector<4xf32> | |
%581 = "vector.fma"(%580, %299, %578) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%582 = "vector.extract"(%230) {position = [2]} : (vector<4xf32>) -> f32 | |
%583 = "vector.splat"(%582) : (f32) -> vector<4xf32> | |
%584 = "vector.fma"(%583, %301, %581) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%585 = "vector.extract"(%230) {position = [3]} : (vector<4xf32>) -> f32 | |
%586 = "vector.splat"(%585) : (f32) -> vector<4xf32> | |
%587 = "vector.fma"(%586, %303, %584) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%588 = "vector.extract"(%232) {position = [0]} : (vector<4xf32>) -> f32 | |
%589 = "vector.splat"(%588) : (f32) -> vector<4xf32> | |
%590 = "vector.fma"(%589, %305, %587) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%591 = "vector.extract"(%232) {position = [1]} : (vector<4xf32>) -> f32 | |
%592 = "vector.splat"(%591) : (f32) -> vector<4xf32> | |
%593 = "vector.fma"(%592, %307, %590) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%594 = "vector.extract"(%232) {position = [2]} : (vector<4xf32>) -> f32 | |
%595 = "vector.splat"(%594) : (f32) -> vector<4xf32> | |
%596 = "vector.fma"(%595, %309, %593) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%597 = "vector.extract"(%232) {position = [3]} : (vector<4xf32>) -> f32 | |
%598 = "vector.splat"(%597) : (f32) -> vector<4xf32> | |
%599 = "vector.fma"(%598, %311, %596) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%600 = "vector.extract"(%234) {position = [0]} : (vector<4xf32>) -> f32 | |
%601 = "vector.splat"(%600) : (f32) -> vector<4xf32> | |
%602 = "vector.fma"(%601, %249, %172#3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%603 = "vector.extract"(%234) {position = [1]} : (vector<4xf32>) -> f32 | |
%604 = "vector.splat"(%603) : (f32) -> vector<4xf32> | |
%605 = "vector.fma"(%604, %251, %602) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%606 = "vector.extract"(%234) {position = [2]} : (vector<4xf32>) -> f32 | |
%607 = "vector.splat"(%606) : (f32) -> vector<4xf32> | |
%608 = "vector.fma"(%607, %253, %605) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%609 = "vector.extract"(%234) {position = [3]} : (vector<4xf32>) -> f32 | |
%610 = "vector.splat"(%609) : (f32) -> vector<4xf32> | |
%611 = "vector.fma"(%610, %255, %608) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%612 = "vector.extract"(%236) {position = [0]} : (vector<4xf32>) -> f32 | |
%613 = "vector.splat"(%612) : (f32) -> vector<4xf32> | |
%614 = "vector.fma"(%613, %257, %611) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%615 = "vector.extract"(%236) {position = [1]} : (vector<4xf32>) -> f32 | |
%616 = "vector.splat"(%615) : (f32) -> vector<4xf32> | |
%617 = "vector.fma"(%616, %259, %614) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%618 = "vector.extract"(%236) {position = [2]} : (vector<4xf32>) -> f32 | |
%619 = "vector.splat"(%618) : (f32) -> vector<4xf32> | |
%620 = "vector.fma"(%619, %261, %617) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%621 = "vector.extract"(%236) {position = [3]} : (vector<4xf32>) -> f32 | |
%622 = "vector.splat"(%621) : (f32) -> vector<4xf32> | |
%623 = "vector.fma"(%622, %263, %620) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%624 = "vector.extract"(%238) {position = [0]} : (vector<4xf32>) -> f32 | |
%625 = "vector.splat"(%624) : (f32) -> vector<4xf32> | |
%626 = "vector.fma"(%625, %265, %623) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%627 = "vector.extract"(%238) {position = [1]} : (vector<4xf32>) -> f32 | |
%628 = "vector.splat"(%627) : (f32) -> vector<4xf32> | |
%629 = "vector.fma"(%628, %267, %626) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%630 = "vector.extract"(%238) {position = [2]} : (vector<4xf32>) -> f32 | |
%631 = "vector.splat"(%630) : (f32) -> vector<4xf32> | |
%632 = "vector.fma"(%631, %269, %629) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%633 = "vector.extract"(%238) {position = [3]} : (vector<4xf32>) -> f32 | |
%634 = "vector.splat"(%633) : (f32) -> vector<4xf32> | |
%635 = "vector.fma"(%634, %271, %632) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%636 = "vector.extract"(%240) {position = [0]} : (vector<4xf32>) -> f32 | |
%637 = "vector.splat"(%636) : (f32) -> vector<4xf32> | |
%638 = "vector.fma"(%637, %273, %635) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%639 = "vector.extract"(%240) {position = [1]} : (vector<4xf32>) -> f32 | |
%640 = "vector.splat"(%639) : (f32) -> vector<4xf32> | |
%641 = "vector.fma"(%640, %275, %638) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%642 = "vector.extract"(%240) {position = [2]} : (vector<4xf32>) -> f32 | |
%643 = "vector.splat"(%642) : (f32) -> vector<4xf32> | |
%644 = "vector.fma"(%643, %277, %641) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%645 = "vector.extract"(%240) {position = [3]} : (vector<4xf32>) -> f32 | |
%646 = "vector.splat"(%645) : (f32) -> vector<4xf32> | |
%647 = "vector.fma"(%646, %279, %644) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%648 = "vector.extract"(%242) {position = [0]} : (vector<4xf32>) -> f32 | |
%649 = "vector.splat"(%648) : (f32) -> vector<4xf32> | |
%650 = "vector.fma"(%649, %281, %647) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%651 = "vector.extract"(%242) {position = [1]} : (vector<4xf32>) -> f32 | |
%652 = "vector.splat"(%651) : (f32) -> vector<4xf32> | |
%653 = "vector.fma"(%652, %283, %650) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%654 = "vector.extract"(%242) {position = [2]} : (vector<4xf32>) -> f32 | |
%655 = "vector.splat"(%654) : (f32) -> vector<4xf32> | |
%656 = "vector.fma"(%655, %285, %653) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%657 = "vector.extract"(%242) {position = [3]} : (vector<4xf32>) -> f32 | |
%658 = "vector.splat"(%657) : (f32) -> vector<4xf32> | |
%659 = "vector.fma"(%658, %287, %656) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%660 = "vector.extract"(%244) {position = [0]} : (vector<4xf32>) -> f32 | |
%661 = "vector.splat"(%660) : (f32) -> vector<4xf32> | |
%662 = "vector.fma"(%661, %289, %659) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%663 = "vector.extract"(%244) {position = [1]} : (vector<4xf32>) -> f32 | |
%664 = "vector.splat"(%663) : (f32) -> vector<4xf32> | |
%665 = "vector.fma"(%664, %291, %662) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%666 = "vector.extract"(%244) {position = [2]} : (vector<4xf32>) -> f32 | |
%667 = "vector.splat"(%666) : (f32) -> vector<4xf32> | |
%668 = "vector.fma"(%667, %293, %665) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%669 = "vector.extract"(%244) {position = [3]} : (vector<4xf32>) -> f32 | |
%670 = "vector.splat"(%669) : (f32) -> vector<4xf32> | |
%671 = "vector.fma"(%670, %295, %668) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%672 = "vector.extract"(%246) {position = [0]} : (vector<4xf32>) -> f32 | |
%673 = "vector.splat"(%672) : (f32) -> vector<4xf32> | |
%674 = "vector.fma"(%673, %297, %671) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%675 = "vector.extract"(%246) {position = [1]} : (vector<4xf32>) -> f32 | |
%676 = "vector.splat"(%675) : (f32) -> vector<4xf32> | |
%677 = "vector.fma"(%676, %299, %674) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%678 = "vector.extract"(%246) {position = [2]} : (vector<4xf32>) -> f32 | |
%679 = "vector.splat"(%678) : (f32) -> vector<4xf32> | |
%680 = "vector.fma"(%679, %301, %677) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%681 = "vector.extract"(%246) {position = [3]} : (vector<4xf32>) -> f32 | |
%682 = "vector.splat"(%681) : (f32) -> vector<4xf32> | |
%683 = "vector.fma"(%682, %303, %680) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%684 = "vector.extract"(%248) {position = [0]} : (vector<4xf32>) -> f32 | |
%685 = "vector.splat"(%684) : (f32) -> vector<4xf32> | |
%686 = "vector.fma"(%685, %305, %683) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%687 = "vector.extract"(%248) {position = [1]} : (vector<4xf32>) -> f32 | |
%688 = "vector.splat"(%687) : (f32) -> vector<4xf32> | |
%689 = "vector.fma"(%688, %307, %686) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%690 = "vector.extract"(%248) {position = [2]} : (vector<4xf32>) -> f32 | |
%691 = "vector.splat"(%690) : (f32) -> vector<4xf32> | |
%692 = "vector.fma"(%691, %309, %689) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
%693 = "vector.extract"(%248) {position = [3]} : (vector<4xf32>) -> f32 | |
%694 = "vector.splat"(%693) : (f32) -> vector<4xf32> | |
%695 = "vector.fma"(%694, %311, %692) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
"memref.store"(%695, %109, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%599, %110, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%503, %111, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"memref.store"(%407, %112, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 16, 1]>, sym_name = "forward_dispatch_35_matmul_18432x320x320"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_35", sym_visibility = "private"} : () -> () | |
%133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
^ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment