Created
          February 9, 2023 05:02 
        
      - 
      
- 
        Save pashu123/b3449d68be413ecc828a5892ee1ec598 to your computer and use it in GitHub Desktop. 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | /home/prashant/stable.mlir:793:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:793:11: note: see current operation: %20 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:793:11: note: see existing live user here: %28 = "spirv.UConvert"(%20) : (i32) -> i64 | |
| /home/prashant/stable.mlir:804:27: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:804:27: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 5 : index} : () -> index | |
| %1 = "arith.constant"() {value = 2 : index} : () -> index | |
| %2 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -1 : index} : () -> index | |
| %1 = "arith.constant"() {value = 4 : index} : () -> index | |
| %2 = "arith.constant"() {value = 32 : index} : () -> index | |
| %3 = "arith.constant"() {value = 160 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 640 : index} : () -> index | |
| %6 = "arith.constant"() {value = 320 : index} : () -> index | |
| %7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %9 = "arith.index_castui"(%7) : (i32) -> index | |
| %10 = "arith.index_castui"(%8) : (i32) -> index | |
| %11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %19 = "arith.muli"(%16, %3) : (index, index) -> index | |
| %20 = "arith.muli"(%18, %3) : (index, index) -> index | |
| %21 = "arith.addi"(%19, %20) : (index, index) -> index | |
| %22 = "arith.muli"(%15, %2) : (index, index) -> index | |
| %23 = "arith.addi"(%21, %22) : (index, index) -> index | |
| %24 = "arith.addi"(%23, %17) : (index, index) -> index | |
| %25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %26 = "arith.subi"(%0, %9) : (index, index) -> index | |
| %27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
| %28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
| %29 = "arith.subi"(%0, %28) : (index, index) -> index | |
| %30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
| %31 = "arith.addi"(%24, %30) : (index, index) -> index | |
| %32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %33 = "arith.muli"(%16, %6) : (index, index) -> index | |
| %34 = "arith.muli"(%18, %6) : (index, index) -> index | |
| %35 = "arith.addi"(%33, %34) : (index, index) -> index | |
| %36 = "arith.addi"(%35, %22) : (index, index) -> index | |
| %37 = "arith.addi"(%36, %17) : (index, index) -> index | |
| %38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %39 = "arith.subi"(%0, %10) : (index, index) -> index | |
| %40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
| %41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
| %42 = "arith.subi"(%0, %41) : (index, index) -> index | |
| %43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
| %44 = "arith.addi"(%37, %43) : (index, index) -> index | |
| "memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:804:27: error: failed to serialize executables | |
| %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:804:27: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 5 : index} : () -> index | |
| %1 = "arith.constant"() {value = 2 : index} : () -> index | |
| %2 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -1 : index} : () -> index | |
| %1 = "arith.constant"() {value = 4 : index} : () -> index | |
| %2 = "arith.constant"() {value = 32 : index} : () -> index | |
| %3 = "arith.constant"() {value = 160 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 640 : index} : () -> index | |
| %6 = "arith.constant"() {value = 320 : index} : () -> index | |
| %7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %9 = "arith.index_castui"(%7) : (i32) -> index | |
| %10 = "arith.index_castui"(%8) : (i32) -> index | |
| %11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %19 = "arith.muli"(%16, %3) : (index, index) -> index | |
| %20 = "arith.muli"(%18, %3) : (index, index) -> index | |
| %21 = "arith.addi"(%19, %20) : (index, index) -> index | |
| %22 = "arith.muli"(%15, %2) : (index, index) -> index | |
| %23 = "arith.addi"(%21, %22) : (index, index) -> index | |
| %24 = "arith.addi"(%23, %17) : (index, index) -> index | |
| %25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %26 = "arith.subi"(%0, %9) : (index, index) -> index | |
| %27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
| %28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
| %29 = "arith.subi"(%0, %28) : (index, index) -> index | |
| %30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
| %31 = "arith.addi"(%24, %30) : (index, index) -> index | |
| %32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %33 = "arith.muli"(%16, %6) : (index, index) -> index | |
| %34 = "arith.muli"(%18, %6) : (index, index) -> index | |
| %35 = "arith.addi"(%33, %34) : (index, index) -> index | |
| %36 = "arith.addi"(%35, %22) : (index, index) -> index | |
| %37 = "arith.addi"(%36, %17) : (index, index) -> index | |
| %38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %39 = "arith.subi"(%0, %10) : (index, index) -> index | |
| %40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
| %41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
| %42 = "arith.subi"(%0, %41) : (index, index) -> index | |
| %43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
| %44 = "arith.addi"(%37, %43) : (index, index) -> index | |
| "memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_3", sym_visibility = "private"} : () -> () | |
| %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:866:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:866:11: note: see current operation: %51 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:866:11: note: see existing live user here: %59 = "spirv.UConvert"(%51) : (i32) -> i64 | |
| /home/prashant/stable.mlir:866:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:866:11: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 64 : index} : () -> index | |
| %1 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %1) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_10_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 16 : index} : () -> index | |
| %1 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %2 = "arith.constant"() {value = -1 : index} : () -> index | |
| %3 = "arith.constant"() {value = 4 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 64 : index} : () -> index | |
| %6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
| %10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
| %11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
| %12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
| %13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
| %14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
| %15 = "arith.constant"() {value = 32 : index} : () -> index | |
| %16 = "arith.constant"() {value = 15 : index} : () -> index | |
| %17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
| %18 = "arith.constant"() {value = 2048 : index} : () -> index | |
| %19 = "arith.constant"() {value = 92160 : index} : () -> index | |
| %20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %23 = "arith.index_castui"(%21) : (i32) -> index | |
| %24 = "arith.index_castui"(%22) : (i32) -> index | |
| %25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %31 = "scf.for"(%4, %19, %18, %8) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>): | |
| %70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
| %72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
| %73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
| %74 = "arith.subi"(%2, %73) : (index, index) -> index | |
| %75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
| %76 = "arith.muli"(%30, %1) : (index, index) -> index | |
| %77 = "arith.addi"(%20, %76) : (index, index) -> index | |
| %78 = "arith.addi"(%75, %77) : (index, index) -> index | |
| %79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %80 = "arith.subi"(%2, %23) : (index, index) -> index | |
| %81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
| %82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
| %83 = "arith.subi"(%2, %82) : (index, index) -> index | |
| %84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
| %85 = "arith.addi"(%78, %84) : (index, index) -> index | |
| %86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %87 = "arith.addf"(%86, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "scf.yield"(%87) : (vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
| %32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %33 = "arith.subi"(%2, %24) : (index, index) -> index | |
| %34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
| %35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
| %36 = "arith.subi"(%2, %35) : (index, index) -> index | |
| %37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
| %38 = "arith.addi"(%30, %37) : (index, index) -> index | |
| %39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
| %42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
| %43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
| %54 = "arith.divui"(%20, %15) : (index, index) -> index | |
| %55 = "arith.remui"(%20, %15) : (index, index) -> index | |
| %56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%56) ({ | |
| "memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "gpu.barrier"() : () -> () | |
| %57 = "arith.minui"(%55, %16) : (index, index) -> index | |
| %58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
| %59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
| %68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%69) ({ | |
| "memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_10_generic_64x92160"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:866:11: error: failed to serialize executables | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:866:11: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 64 : index} : () -> index | |
| %1 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %1) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_10_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 16 : index} : () -> index | |
| %1 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %2 = "arith.constant"() {value = -1 : index} : () -> index | |
| %3 = "arith.constant"() {value = 4 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 64 : index} : () -> index | |
| %6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
| %10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
| %11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
| %12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
| %13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
| %14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
| %15 = "arith.constant"() {value = 32 : index} : () -> index | |
| %16 = "arith.constant"() {value = 15 : index} : () -> index | |
| %17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
| %18 = "arith.constant"() {value = 2048 : index} : () -> index | |
| %19 = "arith.constant"() {value = 92160 : index} : () -> index | |
| %20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %23 = "arith.index_castui"(%21) : (i32) -> index | |
| %24 = "arith.index_castui"(%22) : (i32) -> index | |
| %25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %31 = "scf.for"(%4, %19, %18, %8) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>): | |
| %70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
| %72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
| %73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
| %74 = "arith.subi"(%2, %73) : (index, index) -> index | |
| %75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
| %76 = "arith.muli"(%30, %1) : (index, index) -> index | |
| %77 = "arith.addi"(%20, %76) : (index, index) -> index | |
| %78 = "arith.addi"(%75, %77) : (index, index) -> index | |
| %79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %80 = "arith.subi"(%2, %23) : (index, index) -> index | |
| %81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
| %82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
| %83 = "arith.subi"(%2, %82) : (index, index) -> index | |
| %84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
| %85 = "arith.addi"(%78, %84) : (index, index) -> index | |
| %86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %87 = "arith.addf"(%86, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "scf.yield"(%87) : (vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
| %32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %33 = "arith.subi"(%2, %24) : (index, index) -> index | |
| %34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
| %35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
| %36 = "arith.subi"(%2, %35) : (index, index) -> index | |
| %37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
| %38 = "arith.addi"(%30, %37) : (index, index) -> index | |
| %39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
| %42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
| %43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
| %54 = "arith.divui"(%20, %15) : (index, index) -> index | |
| %55 = "arith.remui"(%20, %15) : (index, index) -> index | |
| %56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%56) ({ | |
| "memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "gpu.barrier"() : () -> () | |
| %57 = "arith.minui"(%55, %16) : (index, index) -> index | |
| %58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
| %59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
| %68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%69) ({ | |
| "memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_10_generic_64x92160"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_10", sym_visibility = "private"} : () -> () | |
| %32 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see current operation: %30 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see existing live user here: %45 = "spirv.UConvert"(%30) : (i32) -> i64 | |
| /home/prashant/stable.mlir:876:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 72 : index} : () -> index | |
| %1 = "arith.constant"() {value = 10 : index} : () -> index | |
| %2 = "arith.constant"() {value = 64 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_11_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 4 : index} : () -> index | |
| %1 = "arith.constant"() {value = -1 : index} : () -> index | |
| %2 = "arith.constant"() {value = 16 : index} : () -> index | |
| %3 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %4 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %5 = "arith.constant"() {value = 32 : index} : () -> index | |
| %6 = "arith.constant"() {value = 0 : index} : () -> index | |
| %7 = "arith.constant"() {value = 64 : index} : () -> index | |
| %8 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %9 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %10 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
| %11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %14 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
| %15 = "arith.index_castui"(%12) : (i32) -> index | |
| %16 = "arith.index_castui"(%13) : (i32) -> index | |
| %17 = "arith.index_castui"(%14) : (i32) -> index | |
| %18 = "hal.interface.binding.subspan"(%15, %8) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%16, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.binding.subspan"(%6, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %22 = "hal.interface.binding.subspan"(%17, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %23 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %24 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %25 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %26 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %28 = "arith.muli"(%24, %5) : (index, index) -> index | |
| %29 = "arith.addi"(%27, %28) : (index, index) -> index | |
| %30 = "arith.muli"(%26, %4) : (index, index) -> index | |
| %31 = "arith.addi"(%29, %30) : (index, index) -> index | |
| %32 = "arith.muli"(%25, %3) : (index, index) -> index | |
| %33 = "arith.addi"(%31, %32) : (index, index) -> index | |
| %34 = "arith.cmpi"(%15, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %35 = "arith.subi"(%1, %15) : (index, index) -> index | |
| %36 = "arith.select"(%34, %35, %15) : (i1, index, index) -> index | |
| %37 = "arith.divsi"(%36, %2) : (index, index) -> index | |
| %38 = "arith.subi"(%1, %37) : (index, index) -> index | |
| %39 = "arith.select"(%34, %38, %37) : (i1, index, index) -> index | |
| %40 = "arith.addi"(%33, %39) : (index, index) -> index | |
| %41 = "memref.load"(%19, %40) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %42 = "arith.cmpi"(%16, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %43 = "arith.subi"(%1, %16) : (index, index) -> index | |
| %44 = "arith.select"(%42, %43, %16) : (i1, index, index) -> index | |
| %45 = "arith.divsi"(%44, %0) : (index, index) -> index | |
| %46 = "arith.subi"(%1, %45) : (index, index) -> index | |
| %47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
| %48 = "arith.addi"(%26, %47) : (index, index) -> index | |
| %49 = "memref.load"(%21, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %50 = "vector.insert"(%49, %9) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %51 = "arith.divf"(%50, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %52 = "vector.extract"(%51) {position = [0]} : (vector<1xf32>) -> f32 | |
| %53 = "vector.insert"(%52, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %54 = "vector.insert"(%52, %53) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %55 = "vector.insert"(%52, %54) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %56 = "vector.insert"(%52, %55) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %57 = "arith.subf"(%41, %56) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %58 = "arith.cmpi"(%17, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %59 = "arith.subi"(%1, %17) : (index, index) -> index | |
| %60 = "arith.select"(%58, %59, %17) : (i1, index, index) -> index | |
| %61 = "arith.divsi"(%60, %2) : (index, index) -> index | |
| %62 = "arith.subi"(%1, %61) : (index, index) -> index | |
| %63 = "arith.select"(%58, %62, %61) : (i1, index, index) -> index | |
| %64 = "arith.addi"(%33, %63) : (index, index) -> index | |
| "memref.store"(%57, %23, %64) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_11_generic_64x10x9216"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: error: failed to serialize executables | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 72 : index} : () -> index | |
| %1 = "arith.constant"() {value = 10 : index} : () -> index | |
| %2 = "arith.constant"() {value = 64 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_11_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 4 : index} : () -> index | |
| %1 = "arith.constant"() {value = -1 : index} : () -> index | |
| %2 = "arith.constant"() {value = 16 : index} : () -> index | |
| %3 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %4 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %5 = "arith.constant"() {value = 32 : index} : () -> index | |
| %6 = "arith.constant"() {value = 0 : index} : () -> index | |
| %7 = "arith.constant"() {value = 64 : index} : () -> index | |
| %8 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %9 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %10 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
| %11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %14 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
| %15 = "arith.index_castui"(%12) : (i32) -> index | |
| %16 = "arith.index_castui"(%13) : (i32) -> index | |
| %17 = "arith.index_castui"(%14) : (i32) -> index | |
| %18 = "hal.interface.binding.subspan"(%15, %8) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%16, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.binding.subspan"(%6, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %22 = "hal.interface.binding.subspan"(%17, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %23 = "hal.interface.binding.subspan"(%6, %8) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %24 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %25 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %26 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %28 = "arith.muli"(%24, %5) : (index, index) -> index | |
| %29 = "arith.addi"(%27, %28) : (index, index) -> index | |
| %30 = "arith.muli"(%26, %4) : (index, index) -> index | |
| %31 = "arith.addi"(%29, %30) : (index, index) -> index | |
| %32 = "arith.muli"(%25, %3) : (index, index) -> index | |
| %33 = "arith.addi"(%31, %32) : (index, index) -> index | |
| %34 = "arith.cmpi"(%15, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %35 = "arith.subi"(%1, %15) : (index, index) -> index | |
| %36 = "arith.select"(%34, %35, %15) : (i1, index, index) -> index | |
| %37 = "arith.divsi"(%36, %2) : (index, index) -> index | |
| %38 = "arith.subi"(%1, %37) : (index, index) -> index | |
| %39 = "arith.select"(%34, %38, %37) : (i1, index, index) -> index | |
| %40 = "arith.addi"(%33, %39) : (index, index) -> index | |
| %41 = "memref.load"(%19, %40) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %42 = "arith.cmpi"(%16, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %43 = "arith.subi"(%1, %16) : (index, index) -> index | |
| %44 = "arith.select"(%42, %43, %16) : (i1, index, index) -> index | |
| %45 = "arith.divsi"(%44, %0) : (index, index) -> index | |
| %46 = "arith.subi"(%1, %45) : (index, index) -> index | |
| %47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
| %48 = "arith.addi"(%26, %47) : (index, index) -> index | |
| %49 = "memref.load"(%21, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %50 = "vector.insert"(%49, %9) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %51 = "arith.divf"(%50, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %52 = "vector.extract"(%51) {position = [0]} : (vector<1xf32>) -> f32 | |
| %53 = "vector.insert"(%52, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %54 = "vector.insert"(%52, %53) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %55 = "vector.insert"(%52, %54) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %56 = "vector.insert"(%52, %55) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %57 = "arith.subf"(%41, %56) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %58 = "arith.cmpi"(%17, %6) {predicate = 2 : i64} : (index, index) -> i1 | |
| %59 = "arith.subi"(%1, %17) : (index, index) -> index | |
| %60 = "arith.select"(%58, %59, %17) : (i1, index, index) -> index | |
| %61 = "arith.divsi"(%60, %2) : (index, index) -> index | |
| %62 = "arith.subi"(%1, %61) : (index, index) -> index | |
| %63 = "arith.select"(%58, %62, %61) : (i1, index, index) -> index | |
| %64 = "arith.addi"(%33, %63) : (index, index) -> index | |
| "memref.store"(%57, %23, %64) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_11_generic_64x10x9216"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_11", sym_visibility = "private"} : () -> () | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: note: see current operation: %51 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: note: see existing live user here: %59 = "spirv.UConvert"(%51) : (i32) -> i64 | |
| /home/prashant/stable.mlir:886:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 64 : index} : () -> index | |
| %1 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %1) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_12_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 16 : index} : () -> index | |
| %1 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %2 = "arith.constant"() {value = -1 : index} : () -> index | |
| %3 = "arith.constant"() {value = 4 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 64 : index} : () -> index | |
| %6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
| %10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
| %11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
| %12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
| %13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
| %14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
| %15 = "arith.constant"() {value = 32 : index} : () -> index | |
| %16 = "arith.constant"() {value = 15 : index} : () -> index | |
| %17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
| %18 = "arith.constant"() {value = 2048 : index} : () -> index | |
| %19 = "arith.constant"() {value = 92160 : index} : () -> index | |
| %20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %23 = "arith.index_castui"(%21) : (i32) -> index | |
| %24 = "arith.index_castui"(%22) : (i32) -> index | |
| %25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %31 = "scf.for"(%4, %19, %18, %8) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>): | |
| %70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
| %72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
| %73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
| %74 = "arith.subi"(%2, %73) : (index, index) -> index | |
| %75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
| %76 = "arith.muli"(%30, %1) : (index, index) -> index | |
| %77 = "arith.addi"(%20, %76) : (index, index) -> index | |
| %78 = "arith.addi"(%75, %77) : (index, index) -> index | |
| %79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %80 = "arith.subi"(%2, %23) : (index, index) -> index | |
| %81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
| %82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
| %83 = "arith.subi"(%2, %82) : (index, index) -> index | |
| %84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
| %85 = "arith.addi"(%78, %84) : (index, index) -> index | |
| %86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %87 = "arith.mulf"(%86, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %88 = "arith.addf"(%87, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "scf.yield"(%88) : (vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
| %32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %33 = "arith.subi"(%2, %24) : (index, index) -> index | |
| %34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
| %35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
| %36 = "arith.subi"(%2, %35) : (index, index) -> index | |
| %37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
| %38 = "arith.addi"(%30, %37) : (index, index) -> index | |
| %39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
| %42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
| %43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
| %54 = "arith.divui"(%20, %15) : (index, index) -> index | |
| %55 = "arith.remui"(%20, %15) : (index, index) -> index | |
| %56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%56) ({ | |
| "memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "gpu.barrier"() : () -> () | |
| %57 = "arith.minui"(%55, %16) : (index, index) -> index | |
| %58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
| %59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
| %68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%69) ({ | |
| "memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_12_generic_64x92160"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: error: failed to serialize executables | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:886:11: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
| %0 = "arith.constant"() {value = 64 : index} : () -> index | |
| %1 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %1) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_12_generic_64x92160", translation_info = #iree_codegen.translation_info<SPIRVSubgroupReduce>, workgroup_size = [512 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 16 : index} : () -> index | |
| %1 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %2 = "arith.constant"() {value = -1 : index} : () -> index | |
| %3 = "arith.constant"() {value = 4 : index} : () -> index | |
| %4 = "arith.constant"() {value = 0 : index} : () -> index | |
| %5 = "arith.constant"() {value = 64 : index} : () -> index | |
| %6 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %7 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %9 = "arith.constant"() {value = 32 : i32} : () -> i32 | |
| %10 = "arith.constant"() {value = 1 : i32} : () -> i32 | |
| %11 = "arith.constant"() {value = 2 : i32} : () -> i32 | |
| %12 = "arith.constant"() {value = 4 : i32} : () -> i32 | |
| %13 = "arith.constant"() {value = 8 : i32} : () -> i32 | |
| %14 = "arith.constant"() {value = 16 : i32} : () -> i32 | |
| %15 = "arith.constant"() {value = 32 : index} : () -> index | |
| %16 = "arith.constant"() {value = 15 : index} : () -> index | |
| %17 = "arith.constant"() {value = 0 : i32} : () -> i32 | |
| %18 = "arith.constant"() {value = 2048 : index} : () -> index | |
| %19 = "arith.constant"() {value = 92160 : index} : () -> index | |
| %20 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %21 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %22 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %23 = "arith.index_castui"(%21) : (i32) -> index | |
| %24 = "arith.index_castui"(%22) : (i32) -> index | |
| %25 = "hal.interface.binding.subspan"(%23, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %26 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %27 = "hal.interface.binding.subspan"(%24, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %28 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %29 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %30 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %31 = "scf.for"(%4, %19, %18, %8) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>): | |
| %70 = "arith.cmpi"(%arg0, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %71 = "arith.subi"(%2, %arg0) : (index, index) -> index | |
| %72 = "arith.select"(%70, %71, %arg0) : (i1, index, index) -> index | |
| %73 = "arith.divsi"(%72, %3) : (index, index) -> index | |
| %74 = "arith.subi"(%2, %73) : (index, index) -> index | |
| %75 = "arith.select"(%70, %74, %73) : (i1, index, index) -> index | |
| %76 = "arith.muli"(%30, %1) : (index, index) -> index | |
| %77 = "arith.addi"(%20, %76) : (index, index) -> index | |
| %78 = "arith.addi"(%75, %77) : (index, index) -> index | |
| %79 = "arith.cmpi"(%23, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %80 = "arith.subi"(%2, %23) : (index, index) -> index | |
| %81 = "arith.select"(%79, %80, %23) : (i1, index, index) -> index | |
| %82 = "arith.divsi"(%81, %0) : (index, index) -> index | |
| %83 = "arith.subi"(%2, %82) : (index, index) -> index | |
| %84 = "arith.select"(%79, %83, %82) : (i1, index, index) -> index | |
| %85 = "arith.addi"(%78, %84) : (index, index) -> index | |
| %86 = "memref.load"(%26, %85) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %87 = "arith.mulf"(%86, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %88 = "arith.addf"(%87, %arg1) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "scf.yield"(%88) : (vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>) -> vector<4xf32> | |
| %32 = "arith.cmpi"(%24, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
| %33 = "arith.subi"(%2, %24) : (index, index) -> index | |
| %34 = "arith.select"(%32, %33, %24) : (i1, index, index) -> index | |
| %35 = "arith.divsi"(%34, %3) : (index, index) -> index | |
| %36 = "arith.subi"(%2, %35) : (index, index) -> index | |
| %37 = "arith.select"(%32, %36, %35) : (i1, index, index) -> index | |
| %38 = "arith.addi"(%30, %37) : (index, index) -> index | |
| %39 = "memref.load"(%28, %38) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %7) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "vector.extractelement"(%40, %4) : (vector<1xf32>, index) -> f32 | |
| %42 = "vector.reduction"(%31) {kind = #vector.kind<add>} : (vector<4xf32>) -> f32 | |
| %43:2 = "gpu.shuffle"(%42, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %44 = "arith.addf"(%42, %43#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %45:2 = "gpu.shuffle"(%44, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %46 = "arith.addf"(%44, %45#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %47:2 = "gpu.shuffle"(%46, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %48 = "arith.addf"(%46, %47#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %49:2 = "gpu.shuffle"(%48, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %50 = "arith.addf"(%48, %49#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %51:2 = "gpu.shuffle"(%50, %14, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %52 = "arith.addf"(%50, %51#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %53 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<16xf32, #spirv.storage_class<Workgroup>> | |
| %54 = "arith.divui"(%20, %15) : (index, index) -> index | |
| %55 = "arith.remui"(%20, %15) : (index, index) -> index | |
| %56 = "arith.cmpi"(%55, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%56) ({ | |
| "memref.store"(%52, %53, %54) {nontemporal = false} : (f32, memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "gpu.barrier"() : () -> () | |
| %57 = "arith.minui"(%55, %16) : (index, index) -> index | |
| %58 = "memref.load"(%53, %57) {nontemporal = false} : (memref<16xf32, #spirv.storage_class<Workgroup>>, index) -> f32 | |
| %59:2 = "gpu.shuffle"(%58, %10, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %60 = "arith.addf"(%58, %59#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %61:2 = "gpu.shuffle"(%60, %11, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %62 = "arith.addf"(%60, %61#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %63:2 = "gpu.shuffle"(%62, %12, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %64 = "arith.addf"(%62, %63#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %65:2 = "gpu.shuffle"(%64, %13, %9) {mode = #gpu<shuffle_mode xor>} : (f32, i32, i32) -> (f32, i1) | |
| %66 = "arith.addf"(%64, %65#0) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %67:2 = "gpu.shuffle"(%66, %17, %9) {mode = #gpu<shuffle_mode idx>} : (f32, i32, i32) -> (f32, i1) | |
| %68 = "arith.addf"(%67#0, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32 | |
| %69 = "arith.cmpi"(%20, %4) {predicate = 0 : i64} : (index, index) -> i1 | |
| "scf.if"(%69) ({ | |
| "memref.store"(%68, %29, %38) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }, { | |
| }) : (i1) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [512, 1, 1]>, sym_name = "forward_dispatch_12_generic_64x92160"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_12", sym_visibility = "private"} : () -> () | |
| %36 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%35 : tensor<2x32x10x9216xf64>) outs(%31 : tensor<2x32x1x1xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see current operation: %30 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %34 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%29, %33 : tensor<2x32x10x9216xf64>, tensor<2x32x1x1xf64>) outs(%28 : tensor<2x32x10x9216xf64>) { | |
| ^ | |
| /home/prashant/stable.mlir:876:11: note: see existing live user here: %38 = "spirv.UConvert"(%30) : (i32) -> i64 | |
| /home/prashant/stable.mlir:930:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:930:11: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 72 : index} : () -> index | |
| %1 = "arith.constant"() {value = 10 : index} : () -> index | |
| %2 = "arith.constant"() {value = 64 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_13_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -1 : index} : () -> index | |
| %1 = "arith.constant"() {value = 16 : index} : () -> index | |
| %2 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %3 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %4 = "arith.constant"() {value = 32 : index} : () -> index | |
| %5 = "arith.constant"() {value = 0 : index} : () -> index | |
| %6 = "arith.constant"() {value = 64 : index} : () -> index | |
| %7 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %9 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
| %10 = "arith.constant"() {value = dense<9.99999974E-6> : vector<1xf32>} : () -> vector<1xf32> | |
| %11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %14 = "arith.index_castui"(%12) : (i32) -> index | |
| %15 = "arith.index_castui"(%13) : (i32) -> index | |
| %16 = "hal.interface.binding.subspan"(%14, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %17 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %18 = "hal.interface.binding.subspan"(%5, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%15, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %22 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %23 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %24 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %25 = "arith.muli"(%21, %4) : (index, index) -> index | |
| %26 = "arith.addi"(%24, %25) : (index, index) -> index | |
| %27 = "arith.muli"(%23, %3) : (index, index) -> index | |
| %28 = "arith.addi"(%26, %27) : (index, index) -> index | |
| %29 = "arith.muli"(%22, %2) : (index, index) -> index | |
| %30 = "arith.addi"(%28, %29) : (index, index) -> index | |
| %31 = "arith.cmpi"(%14, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
| %32 = "arith.subi"(%0, %14) : (index, index) -> index | |
| %33 = "arith.select"(%31, %32, %14) : (i1, index, index) -> index | |
| %34 = "arith.divsi"(%33, %1) : (index, index) -> index | |
| %35 = "arith.subi"(%0, %34) : (index, index) -> index | |
| %36 = "arith.select"(%31, %35, %34) : (i1, index, index) -> index | |
| %37 = "arith.addi"(%30, %36) : (index, index) -> index | |
| %38 = "memref.load"(%17, %37) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %39 = "memref.load"(%18, %23) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %8) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "arith.divf"(%40, %9) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %42 = "arith.addf"(%41, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %43 = "math.rsqrt"(%42) {fastmath = #arith.fastmath<none>} : (vector<1xf32>) -> vector<1xf32> | |
| %44 = "vector.extract"(%43) {position = [0]} : (vector<1xf32>) -> f32 | |
| %45 = "vector.insert"(%44, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %46 = "vector.insert"(%44, %45) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %47 = "vector.insert"(%44, %46) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %48 = "vector.insert"(%44, %47) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %49 = "arith.mulf"(%38, %48) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %50 = "arith.cmpi"(%15, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
| %51 = "arith.subi"(%0, %15) : (index, index) -> index | |
| %52 = "arith.select"(%50, %51, %15) : (i1, index, index) -> index | |
| %53 = "arith.divsi"(%52, %1) : (index, index) -> index | |
| %54 = "arith.subi"(%0, %53) : (index, index) -> index | |
| %55 = "arith.select"(%50, %54, %53) : (i1, index, index) -> index | |
| %56 = "arith.addi"(%30, %55) : (index, index) -> index | |
| "memref.store"(%49, %20, %56) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_13_generic_64x10x9216"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:930:11: error: failed to serialize executables | |
| %47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:930:11: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 72 : index} : () -> index | |
| %1 = "arith.constant"() {value = 10 : index} : () -> index | |
| %2 = "arith.constant"() {value = 64 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_13_generic_64x10x9216", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0__0", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {aliased, binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -1 : index} : () -> index | |
| %1 = "arith.constant"() {value = 16 : index} : () -> index | |
| %2 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %3 = "arith.constant"() {value = 23040 : index} : () -> index | |
| %4 = "arith.constant"() {value = 32 : index} : () -> index | |
| %5 = "arith.constant"() {value = 0 : index} : () -> index | |
| %6 = "arith.constant"() {value = 64 : index} : () -> index | |
| %7 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %8 = "arith.constant"() {value = dense<0.000000e+00> : vector<1xf32>} : () -> vector<1xf32> | |
| %9 = "arith.constant"() {value = dense<9.216000e+04> : vector<1xf32>} : () -> vector<1xf32> | |
| %10 = "arith.constant"() {value = dense<9.99999974E-6> : vector<1xf32>} : () -> vector<1xf32> | |
| %11 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %13 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %14 = "arith.index_castui"(%12) : (i32) -> index | |
| %15 = "arith.index_castui"(%13) : (i32) -> index | |
| %16 = "hal.interface.binding.subspan"(%14, %7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %17 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %18 = "hal.interface.binding.subspan"(%5, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%15, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%5, %7) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %22 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %23 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %24 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %25 = "arith.muli"(%21, %4) : (index, index) -> index | |
| %26 = "arith.addi"(%24, %25) : (index, index) -> index | |
| %27 = "arith.muli"(%23, %3) : (index, index) -> index | |
| %28 = "arith.addi"(%26, %27) : (index, index) -> index | |
| %29 = "arith.muli"(%22, %2) : (index, index) -> index | |
| %30 = "arith.addi"(%28, %29) : (index, index) -> index | |
| %31 = "arith.cmpi"(%14, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
| %32 = "arith.subi"(%0, %14) : (index, index) -> index | |
| %33 = "arith.select"(%31, %32, %14) : (i1, index, index) -> index | |
| %34 = "arith.divsi"(%33, %1) : (index, index) -> index | |
| %35 = "arith.subi"(%0, %34) : (index, index) -> index | |
| %36 = "arith.select"(%31, %35, %34) : (i1, index, index) -> index | |
| %37 = "arith.addi"(%30, %36) : (index, index) -> index | |
| %38 = "memref.load"(%17, %37) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %39 = "memref.load"(%18, %23) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %40 = "vector.insert"(%39, %8) {position = [0]} : (f32, vector<1xf32>) -> vector<1xf32> | |
| %41 = "arith.divf"(%40, %9) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %42 = "arith.addf"(%41, %10) {fastmath = #arith.fastmath<none>} : (vector<1xf32>, vector<1xf32>) -> vector<1xf32> | |
| %43 = "math.rsqrt"(%42) {fastmath = #arith.fastmath<none>} : (vector<1xf32>) -> vector<1xf32> | |
| %44 = "vector.extract"(%43) {position = [0]} : (vector<1xf32>) -> f32 | |
| %45 = "vector.insert"(%44, %11) {position = [0]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %46 = "vector.insert"(%44, %45) {position = [1]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %47 = "vector.insert"(%44, %46) {position = [2]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %48 = "vector.insert"(%44, %47) {position = [3]} : (f32, vector<4xf32>) -> vector<4xf32> | |
| %49 = "arith.mulf"(%38, %48) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %50 = "arith.cmpi"(%15, %5) {predicate = 2 : i64} : (index, index) -> i1 | |
| %51 = "arith.subi"(%0, %15) : (index, index) -> index | |
| %52 = "arith.select"(%50, %51, %15) : (i1, index, index) -> index | |
| %53 = "arith.divsi"(%52, %1) : (index, index) -> index | |
| %54 = "arith.subi"(%0, %53) : (index, index) -> index | |
| %55 = "arith.select"(%50, %54, %53) : (i1, index, index) -> index | |
| %56 = "arith.addi"(%30, %55) : (index, index) -> index | |
| "memref.store"(%49, %20, %56) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_13_generic_64x10x9216"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_13", sym_visibility = "private"} : () -> () | |
| %47 = linalg.generic {indexing_maps = [#map8, #map9, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %44 : tensor<2x32x10x9216xf32>, tensor<2x32x1x1xf32>) outs(%45 : tensor<2x32x10x9216xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:936:21: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %expanded_730 = tensor.expand_shape %collapsed_729 [[0], [1], [2, 3]] : tensor<2x320x9216xf32> into tensor<2x320x96x96xf32> | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:936:21: note: see current operation: %58 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %expanded_730 = tensor.expand_shape %collapsed_729 [[0], [1], [2, 3]] : tensor<2x320x9216xf32> into tensor<2x320x96x96xf32> | |
| ^ | |
| /home/prashant/stable.mlir:936:21: note: see existing live user here: %80 = "spirv.UConvert"(%58) : (i32) -> i64 | |
| /home/prashant/stable.mlir:957:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
| %0 = "arith.constant"() {value = 3 : index} : () -> index | |
| %1 = "arith.constant"() {value = 24 : index} : () -> index | |
| %2 = "arith.constant"() {value = 640 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_14_generic_2x320x96x96", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 4 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 4 : index} : () -> index | |
| %1 = "arith.constant"() {value = -1 : index} : () -> index | |
| %2 = "arith.constant"() {value = 16 : index} : () -> index | |
| %3 = "arith.constant"() {value = 8 : index} : () -> index | |
| %4 = "arith.constant"() {value = 24 : index} : () -> index | |
| %5 = "arith.constant"() {value = 96 : index} : () -> index | |
| %6 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %7 = "arith.constant"() {value = 737280 : index} : () -> index | |
| %8 = "arith.constant"() {value = 0 : index} : () -> index | |
| %9 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %10 = "arith.constant"() {value = dense<0.693147182> : vector<4xf32>} : () -> vector<4xf32> | |
| %11 = "arith.constant"() {value = dense<1.44269502> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "arith.constant"() {value = dense<0.499705136> : vector<4xf32>} : () -> vector<4xf32> | |
| %13 = "arith.constant"() {value = dense<0.168738902> : vector<4xf32>} : () -> vector<4xf32> | |
| %14 = "arith.constant"() {value = dense<0.0366896503> : vector<4xf32>} : () -> vector<4xf32> | |
| %15 = "arith.constant"() {value = dense<1.314350e-02> : vector<4xf32>} : () -> vector<4xf32> | |
| %16 = "arith.constant"() {value = dense<23> : vector<4xi32>} : () -> vector<4xi32> | |
| %17 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %18 = "arith.constant"() {value = dense<0x7F800000> : vector<4xf32>} : () -> vector<4xf32> | |
| %19 = "arith.constant"() {value = dense<0xFF800000> : vector<4xf32>} : () -> vector<4xf32> | |
| %20 = "arith.constant"() {value = dense<1.17549435E-38> : vector<4xf32>} : () -> vector<4xf32> | |
| %21 = "arith.constant"() {value = dense<127> : vector<4xi32>} : () -> vector<4xi32> | |
| %22 = "arith.constant"() {value = dense<-127> : vector<4xi32>} : () -> vector<4xi32> | |
| %23 = "arith.constant"() {value = dense<1.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %24 = "arith.constant"() {value = 320 : index} : () -> index | |
| %25 = "arith.constant"() {value = 2 : index} : () -> index | |
| %26 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %27 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %28 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
| %29 = "hal.interface.constant.load"() {index = 3 : index} : () -> i32 | |
| %30 = "arith.index_castui"(%26) : (i32) -> index | |
| %31 = "arith.index_castui"(%27) : (i32) -> index | |
| %32 = "arith.index_castui"(%28) : (i32) -> index | |
| %33 = "arith.index_castui"(%29) : (i32) -> index | |
| %34 = "hal.interface.binding.subspan"(%30, %9) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %35 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %36 = "hal.interface.binding.subspan"(%31, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %37 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %38 = "hal.interface.binding.subspan"(%32, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %39 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %40 = "hal.interface.binding.subspan"(%33, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %41 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %42 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %43 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %44 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %45 = "arith.remui"(%44, %24) : (index, index) -> index | |
| %46 = "arith.divui"(%44, %24) : (index, index) -> index | |
| %47 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %48 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| "scf.for"(%46, %25, %25) ({ | |
| ^bb0(%arg0: index): | |
| "scf.for"(%45, %24, %24) ({ | |
| ^bb0(%arg1: index): | |
| %49 = "arith.muli"(%arg0, %7) : (index, index) -> index | |
| %50 = "arith.muli"(%arg1, %6) : (index, index) -> index | |
| %51 = "arith.addi"(%49, %50) : (index, index) -> index | |
| %52 = "arith.muli"(%43, %5) : (index, index) -> index | |
| %53 = "arith.addi"(%51, %52) : (index, index) -> index | |
| %54 = "arith.muli"(%47, %4) : (index, index) -> index | |
| %55 = "arith.addi"(%53, %54) : (index, index) -> index | |
| %56 = "arith.addi"(%55, %48) : (index, index) -> index | |
| %57 = "arith.muli"(%42, %3) : (index, index) -> index | |
| %58 = "arith.addi"(%56, %57) : (index, index) -> index | |
| %59 = "arith.cmpi"(%30, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %60 = "arith.subi"(%1, %30) : (index, index) -> index | |
| %61 = "arith.select"(%59, %60, %30) : (i1, index, index) -> index | |
| %62 = "arith.divsi"(%61, %2) : (index, index) -> index | |
| %63 = "arith.subi"(%1, %62) : (index, index) -> index | |
| %64 = "arith.select"(%59, %63, %62) : (i1, index, index) -> index | |
| %65 = "arith.addi"(%58, %64) : (index, index) -> index | |
| %66 = "memref.load"(%35, %65) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %67 = "arith.cmpi"(%31, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %68 = "arith.subi"(%1, %31) : (index, index) -> index | |
| %69 = "arith.select"(%67, %68, %31) : (i1, index, index) -> index | |
| %70 = "arith.divsi"(%69, %0) : (index, index) -> index | |
| %71 = "arith.subi"(%1, %70) : (index, index) -> index | |
| %72 = "arith.select"(%67, %71, %70) : (i1, index, index) -> index | |
| %73 = "arith.addi"(%arg1, %72) : (index, index) -> index | |
| %74 = "memref.load"(%37, %73) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %75 = "arith.cmpi"(%32, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %76 = "arith.subi"(%1, %32) : (index, index) -> index | |
| %77 = "arith.select"(%75, %76, %32) : (i1, index, index) -> index | |
| %78 = "arith.divsi"(%77, %0) : (index, index) -> index | |
| %79 = "arith.subi"(%1, %78) : (index, index) -> index | |
| %80 = "arith.select"(%75, %79, %78) : (i1, index, index) -> index | |
| %81 = "arith.addi"(%arg1, %80) : (index, index) -> index | |
| %82 = "memref.load"(%39, %81) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %83 = "vector.splat"(%74) : (f32) -> vector<4xf32> | |
| %84 = "arith.mulf"(%66, %83) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %85 = "vector.splat"(%82) : (f32) -> vector<4xf32> | |
| %86 = "arith.addf"(%84, %85) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %87 = "arith.negf"(%86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
| %88 = "arith.cmpf"(%87, %87) {predicate = 14 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %89 = "arith.mulf"(%87, %11) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %90 = "math.floor"(%89) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
| %91 = "arith.mulf"(%90, %10) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %92 = "arith.subf"(%87, %91) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %93 = "arith.mulf"(%92, %92) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %94 = "arith.mulf"(%93, %93) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %95 = "math.fma"(%23, %92, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %96 = "math.fma"(%13, %92, %12) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %97 = "math.fma"(%15, %92, %14) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %98 = "math.fma"(%96, %93, %95) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %99 = "math.fma"(%97, %94, %98) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %100 = "arith.fptosi"(%90) : (vector<4xf32>) -> vector<4xi32> | |
| %101 = "arith.addi"(%100, %21) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
| %102 = "arith.shli"(%101, %16) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
| %103 = "arith.bitcast"(%102) : (vector<4xi32>) -> vector<4xf32> | |
| %104 = "arith.mulf"(%99, %103) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %105 = "arith.cmpi"(%100, %21) {predicate = 3 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
| %106 = "arith.cmpi"(%100, %22) {predicate = 5 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
| %107 = "arith.cmpf"(%87, %19) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %108 = "arith.cmpf"(%87, %18) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %109 = "arith.cmpf"(%87, %17) {predicate = 2 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %110 = "arith.andi"(%105, %106) : (vector<4xi1>, vector<4xi1>) -> vector<4xi1> | |
| %111 = "arith.select"(%109, %18, %20) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %112 = "arith.select"(%110, %104, %111) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %113 = "arith.select"(%108, %18, %112) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %114 = "arith.select"(%107, %17, %113) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %115 = "arith.select"(%88, %87, %114) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %116 = "arith.addf"(%115, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %117 = "arith.divf"(%23, %116) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %118 = "arith.mulf"(%117, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %119 = "arith.cmpi"(%33, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %120 = "arith.subi"(%1, %33) : (index, index) -> index | |
| %121 = "arith.select"(%119, %120, %33) : (i1, index, index) -> index | |
| %122 = "arith.divsi"(%121, %2) : (index, index) -> index | |
| %123 = "arith.subi"(%1, %122) : (index, index) -> index | |
| %124 = "arith.select"(%119, %123, %122) : (i1, index, index) -> index | |
| %125 = "arith.addi"(%58, %124) : (index, index) -> index | |
| "memref.store"(%118, %41, %125) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [8, 4, 1]>, sym_name = "forward_dispatch_14_generic_2x320x96x96"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: error: failed to serialize executables | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
| %0 = "arith.constant"() {value = 3 : index} : () -> index | |
| %1 = "arith.constant"() {value = 24 : index} : () -> index | |
| %2 = "arith.constant"() {value = 640 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 4, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_14_generic_2x320x96x96", translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [8 : index, 4 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 4 : index} : () -> index | |
| %1 = "arith.constant"() {value = -1 : index} : () -> index | |
| %2 = "arith.constant"() {value = 16 : index} : () -> index | |
| %3 = "arith.constant"() {value = 8 : index} : () -> index | |
| %4 = "arith.constant"() {value = 24 : index} : () -> index | |
| %5 = "arith.constant"() {value = 96 : index} : () -> index | |
| %6 = "arith.constant"() {value = 2304 : index} : () -> index | |
| %7 = "arith.constant"() {value = 737280 : index} : () -> index | |
| %8 = "arith.constant"() {value = 0 : index} : () -> index | |
| %9 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %10 = "arith.constant"() {value = dense<0.693147182> : vector<4xf32>} : () -> vector<4xf32> | |
| %11 = "arith.constant"() {value = dense<1.44269502> : vector<4xf32>} : () -> vector<4xf32> | |
| %12 = "arith.constant"() {value = dense<0.499705136> : vector<4xf32>} : () -> vector<4xf32> | |
| %13 = "arith.constant"() {value = dense<0.168738902> : vector<4xf32>} : () -> vector<4xf32> | |
| %14 = "arith.constant"() {value = dense<0.0366896503> : vector<4xf32>} : () -> vector<4xf32> | |
| %15 = "arith.constant"() {value = dense<1.314350e-02> : vector<4xf32>} : () -> vector<4xf32> | |
| %16 = "arith.constant"() {value = dense<23> : vector<4xi32>} : () -> vector<4xi32> | |
| %17 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %18 = "arith.constant"() {value = dense<0x7F800000> : vector<4xf32>} : () -> vector<4xf32> | |
| %19 = "arith.constant"() {value = dense<0xFF800000> : vector<4xf32>} : () -> vector<4xf32> | |
| %20 = "arith.constant"() {value = dense<1.17549435E-38> : vector<4xf32>} : () -> vector<4xf32> | |
| %21 = "arith.constant"() {value = dense<127> : vector<4xi32>} : () -> vector<4xi32> | |
| %22 = "arith.constant"() {value = dense<-127> : vector<4xi32>} : () -> vector<4xi32> | |
| %23 = "arith.constant"() {value = dense<1.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %24 = "arith.constant"() {value = 320 : index} : () -> index | |
| %25 = "arith.constant"() {value = 2 : index} : () -> index | |
| %26 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %27 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %28 = "hal.interface.constant.load"() {index = 2 : index} : () -> i32 | |
| %29 = "hal.interface.constant.load"() {index = 3 : index} : () -> i32 | |
| %30 = "arith.index_castui"(%26) : (i32) -> index | |
| %31 = "arith.index_castui"(%27) : (i32) -> index | |
| %32 = "arith.index_castui"(%28) : (i32) -> index | |
| %33 = "arith.index_castui"(%29) : (i32) -> index | |
| %34 = "hal.interface.binding.subspan"(%30, %9) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %35 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %36 = "hal.interface.binding.subspan"(%31, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %37 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %38 = "hal.interface.binding.subspan"(%32, %24) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %39 = "hal.interface.binding.subspan"(%8, %24) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %40 = "hal.interface.binding.subspan"(%33, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %41 = "hal.interface.binding.subspan"(%8, %9) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %42 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %43 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %44 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %45 = "arith.remui"(%44, %24) : (index, index) -> index | |
| %46 = "arith.divui"(%44, %24) : (index, index) -> index | |
| %47 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %48 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| "scf.for"(%46, %25, %25) ({ | |
| ^bb0(%arg0: index): | |
| "scf.for"(%45, %24, %24) ({ | |
| ^bb0(%arg1: index): | |
| %49 = "arith.muli"(%arg0, %7) : (index, index) -> index | |
| %50 = "arith.muli"(%arg1, %6) : (index, index) -> index | |
| %51 = "arith.addi"(%49, %50) : (index, index) -> index | |
| %52 = "arith.muli"(%43, %5) : (index, index) -> index | |
| %53 = "arith.addi"(%51, %52) : (index, index) -> index | |
| %54 = "arith.muli"(%47, %4) : (index, index) -> index | |
| %55 = "arith.addi"(%53, %54) : (index, index) -> index | |
| %56 = "arith.addi"(%55, %48) : (index, index) -> index | |
| %57 = "arith.muli"(%42, %3) : (index, index) -> index | |
| %58 = "arith.addi"(%56, %57) : (index, index) -> index | |
| %59 = "arith.cmpi"(%30, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %60 = "arith.subi"(%1, %30) : (index, index) -> index | |
| %61 = "arith.select"(%59, %60, %30) : (i1, index, index) -> index | |
| %62 = "arith.divsi"(%61, %2) : (index, index) -> index | |
| %63 = "arith.subi"(%1, %62) : (index, index) -> index | |
| %64 = "arith.select"(%59, %63, %62) : (i1, index, index) -> index | |
| %65 = "arith.addi"(%58, %64) : (index, index) -> index | |
| %66 = "memref.load"(%35, %65) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %67 = "arith.cmpi"(%31, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %68 = "arith.subi"(%1, %31) : (index, index) -> index | |
| %69 = "arith.select"(%67, %68, %31) : (i1, index, index) -> index | |
| %70 = "arith.divsi"(%69, %0) : (index, index) -> index | |
| %71 = "arith.subi"(%1, %70) : (index, index) -> index | |
| %72 = "arith.select"(%67, %71, %70) : (i1, index, index) -> index | |
| %73 = "arith.addi"(%arg1, %72) : (index, index) -> index | |
| %74 = "memref.load"(%37, %73) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %75 = "arith.cmpi"(%32, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %76 = "arith.subi"(%1, %32) : (index, index) -> index | |
| %77 = "arith.select"(%75, %76, %32) : (i1, index, index) -> index | |
| %78 = "arith.divsi"(%77, %0) : (index, index) -> index | |
| %79 = "arith.subi"(%1, %78) : (index, index) -> index | |
| %80 = "arith.select"(%75, %79, %78) : (i1, index, index) -> index | |
| %81 = "arith.addi"(%arg1, %80) : (index, index) -> index | |
| %82 = "memref.load"(%39, %81) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %83 = "vector.splat"(%74) : (f32) -> vector<4xf32> | |
| %84 = "arith.mulf"(%66, %83) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %85 = "vector.splat"(%82) : (f32) -> vector<4xf32> | |
| %86 = "arith.addf"(%84, %85) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %87 = "arith.negf"(%86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
| %88 = "arith.cmpf"(%87, %87) {predicate = 14 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %89 = "arith.mulf"(%87, %11) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %90 = "math.floor"(%89) {fastmath = #arith.fastmath<none>} : (vector<4xf32>) -> vector<4xf32> | |
| %91 = "arith.mulf"(%90, %10) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %92 = "arith.subf"(%87, %91) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %93 = "arith.mulf"(%92, %92) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %94 = "arith.mulf"(%93, %93) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %95 = "math.fma"(%23, %92, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %96 = "math.fma"(%13, %92, %12) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %97 = "math.fma"(%15, %92, %14) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %98 = "math.fma"(%96, %93, %95) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %99 = "math.fma"(%97, %94, %98) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %100 = "arith.fptosi"(%90) : (vector<4xf32>) -> vector<4xi32> | |
| %101 = "arith.addi"(%100, %21) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
| %102 = "arith.shli"(%101, %16) : (vector<4xi32>, vector<4xi32>) -> vector<4xi32> | |
| %103 = "arith.bitcast"(%102) : (vector<4xi32>) -> vector<4xf32> | |
| %104 = "arith.mulf"(%99, %103) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %105 = "arith.cmpi"(%100, %21) {predicate = 3 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
| %106 = "arith.cmpi"(%100, %22) {predicate = 5 : i64} : (vector<4xi32>, vector<4xi32>) -> vector<4xi1> | |
| %107 = "arith.cmpf"(%87, %19) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %108 = "arith.cmpf"(%87, %18) {predicate = 1 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %109 = "arith.cmpf"(%87, %17) {predicate = 2 : i64} : (vector<4xf32>, vector<4xf32>) -> vector<4xi1> | |
| %110 = "arith.andi"(%105, %106) : (vector<4xi1>, vector<4xi1>) -> vector<4xi1> | |
| %111 = "arith.select"(%109, %18, %20) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %112 = "arith.select"(%110, %104, %111) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %113 = "arith.select"(%108, %18, %112) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %114 = "arith.select"(%107, %17, %113) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %115 = "arith.select"(%88, %87, %114) : (vector<4xi1>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %116 = "arith.addf"(%115, %23) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %117 = "arith.divf"(%23, %116) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %118 = "arith.mulf"(%117, %86) {fastmath = #arith.fastmath<none>} : (vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %119 = "arith.cmpi"(%33, %8) {predicate = 2 : i64} : (index, index) -> i1 | |
| %120 = "arith.subi"(%1, %33) : (index, index) -> index | |
| %121 = "arith.select"(%119, %120, %33) : (i1, index, index) -> index | |
| %122 = "arith.divsi"(%121, %2) : (index, index) -> index | |
| %123 = "arith.subi"(%1, %122) : (index, index) -> index | |
| %124 = "arith.select"(%119, %123, %122) : (i1, index, index) -> index | |
| %125 = "arith.addi"(%58, %124) : (index, index) -> index | |
| "memref.store"(%118, %41, %125) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [8, 4, 1]>, sym_name = "forward_dispatch_14_generic_2x320x96x96"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_14", sym_visibility = "private"} : () -> () | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: note: see current operation: %38 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %51 = linalg.generic {indexing_maps = [#map8, #map8, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %49 : tensor<2x320x96x96xf32>, tensor<2x320x96x96xf32>) outs(%25 : tensor<2x320x96x96xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:957:11: note: see existing live user here: %39 = "spirv.UConvert"(%38) : (i32) -> i64 | |
| /home/prashant/stable.mlir:962:19: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:962:19: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
| %0 = "arith.constant"() {value = 3 : index} : () -> index | |
| %1 = "arith.constant"() {value = 96 : index} : () -> index | |
| %2 = "arith.constant"() {value = 640 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_15", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 17845699 : index} : () -> index | |
| %1 = "arith.constant"() {value = 98 : index} : () -> index | |
| %2 = "arith.constant"() {value = 9604 : index} : () -> index | |
| %3 = "arith.constant"() {value = 3073280 : index} : () -> index | |
| %4 = "arith.constant"() {value = -1 : index} : () -> index | |
| %5 = "arith.constant"() {value = 4 : index} : () -> index | |
| %6 = "arith.constant"() {value = 32 : index} : () -> index | |
| %7 = "arith.constant"() {value = 96 : index} : () -> index | |
| %8 = "arith.constant"() {value = 9216 : index} : () -> index | |
| %9 = "arith.constant"() {value = 2949120 : index} : () -> index | |
| %10 = "arith.constant"() {value = 0 : index} : () -> index | |
| %11 = "arith.constant"() {value = 6146560 : index} : () -> index | |
| %12 = "arith.constant"() {value = 5898240 : index} : () -> index | |
| %13 = "arith.constant"() {value = 320 : index} : () -> index | |
| %14 = "arith.constant"() {value = 2 : index} : () -> index | |
| %15 = "arith.constant"() {value = 71382400 : index} : () -> index | |
| %16 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %17 = "arith.index_castui"(%16) : (i32) -> index | |
| %18 = "hal.interface.binding.subspan"(%17, %12) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%10, %12) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%15, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.binding.subspan"(%10, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %22 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %23 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %24 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %25 = "arith.remui"(%24, %13) : (index, index) -> index | |
| %26 = "arith.divui"(%24, %13) : (index, index) -> index | |
| %27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %28 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %29 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
| "scf.for"(%26, %14, %14) ({ | |
| ^bb0(%arg0: index): | |
| "scf.for"(%25, %13, %13) ({ | |
| ^bb0(%arg1: index): | |
| %30 = "arith.muli"(%arg0, %9) : (index, index) -> index | |
| %31 = "arith.muli"(%arg1, %8) : (index, index) -> index | |
| %32 = "arith.addi"(%30, %31) : (index, index) -> index | |
| %33 = "arith.muli"(%29, %8) : (index, index) -> index | |
| %34 = "arith.addi"(%32, %33) : (index, index) -> index | |
| %35 = "arith.muli"(%23, %7) : (index, index) -> index | |
| %36 = "arith.addi"(%34, %35) : (index, index) -> index | |
| %37 = "arith.muli"(%28, %7) : (index, index) -> index | |
| %38 = "arith.addi"(%36, %37) : (index, index) -> index | |
| %39 = "arith.muli"(%22, %6) : (index, index) -> index | |
| %40 = "arith.addi"(%38, %39) : (index, index) -> index | |
| %41 = "arith.addi"(%40, %27) : (index, index) -> index | |
| %42 = "arith.cmpi"(%17, %10) {predicate = 2 : i64} : (index, index) -> i1 | |
| %43 = "arith.subi"(%4, %17) : (index, index) -> index | |
| %44 = "arith.select"(%42, %43, %17) : (i1, index, index) -> index | |
| %45 = "arith.divsi"(%44, %5) : (index, index) -> index | |
| %46 = "arith.subi"(%4, %45) : (index, index) -> index | |
| %47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
| %48 = "arith.addi"(%41, %47) : (index, index) -> index | |
| %49 = "memref.load"(%19, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %50 = "arith.muli"(%arg0, %3) : (index, index) -> index | |
| %51 = "arith.muli"(%arg1, %2) : (index, index) -> index | |
| %52 = "arith.addi"(%50, %51) : (index, index) -> index | |
| %53 = "arith.muli"(%29, %2) : (index, index) -> index | |
| %54 = "arith.addi"(%52, %53) : (index, index) -> index | |
| %55 = "arith.muli"(%23, %1) : (index, index) -> index | |
| %56 = "arith.addi"(%54, %55) : (index, index) -> index | |
| %57 = "arith.muli"(%28, %1) : (index, index) -> index | |
| %58 = "arith.addi"(%56, %57) : (index, index) -> index | |
| %59 = "arith.addi"(%58, %39) : (index, index) -> index | |
| %60 = "arith.addi"(%59, %27) : (index, index) -> index | |
| %61 = "arith.addi"(%60, %0) : (index, index) -> index | |
| "memref.store"(%49, %21, %61) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_15"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
| ^ | |
| /home/prashant/stable.mlir:962:19: error: failed to serialize executables | |
| %padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:962:19: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index): | |
| %0 = "arith.constant"() {value = 3 : index} : () -> index | |
| %1 = "arith.constant"() {value = 96 : index} : () -> index | |
| %2 = "arith.constant"() {value = 640 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_15", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = 17845699 : index} : () -> index | |
| %1 = "arith.constant"() {value = 98 : index} : () -> index | |
| %2 = "arith.constant"() {value = 9604 : index} : () -> index | |
| %3 = "arith.constant"() {value = 3073280 : index} : () -> index | |
| %4 = "arith.constant"() {value = -1 : index} : () -> index | |
| %5 = "arith.constant"() {value = 4 : index} : () -> index | |
| %6 = "arith.constant"() {value = 32 : index} : () -> index | |
| %7 = "arith.constant"() {value = 96 : index} : () -> index | |
| %8 = "arith.constant"() {value = 9216 : index} : () -> index | |
| %9 = "arith.constant"() {value = 2949120 : index} : () -> index | |
| %10 = "arith.constant"() {value = 0 : index} : () -> index | |
| %11 = "arith.constant"() {value = 6146560 : index} : () -> index | |
| %12 = "arith.constant"() {value = 5898240 : index} : () -> index | |
| %13 = "arith.constant"() {value = 320 : index} : () -> index | |
| %14 = "arith.constant"() {value = 2 : index} : () -> index | |
| %15 = "arith.constant"() {value = 71382400 : index} : () -> index | |
| %16 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %17 = "arith.index_castui"(%16) : (i32) -> index | |
| %18 = "hal.interface.binding.subspan"(%17, %12) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %19 = "hal.interface.binding.subspan"(%10, %12) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %20 = "hal.interface.binding.subspan"(%15, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %21 = "hal.interface.binding.subspan"(%10, %11) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
| %22 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %23 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %24 = "hal.interface.workgroup.id"() {dimension = 2 : index} : () -> index | |
| %25 = "arith.remui"(%24, %13) : (index, index) -> index | |
| %26 = "arith.divui"(%24, %13) : (index, index) -> index | |
| %27 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %28 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %29 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
| "scf.for"(%26, %14, %14) ({ | |
| ^bb0(%arg0: index): | |
| "scf.for"(%25, %13, %13) ({ | |
| ^bb0(%arg1: index): | |
| %30 = "arith.muli"(%arg0, %9) : (index, index) -> index | |
| %31 = "arith.muli"(%arg1, %8) : (index, index) -> index | |
| %32 = "arith.addi"(%30, %31) : (index, index) -> index | |
| %33 = "arith.muli"(%29, %8) : (index, index) -> index | |
| %34 = "arith.addi"(%32, %33) : (index, index) -> index | |
| %35 = "arith.muli"(%23, %7) : (index, index) -> index | |
| %36 = "arith.addi"(%34, %35) : (index, index) -> index | |
| %37 = "arith.muli"(%28, %7) : (index, index) -> index | |
| %38 = "arith.addi"(%36, %37) : (index, index) -> index | |
| %39 = "arith.muli"(%22, %6) : (index, index) -> index | |
| %40 = "arith.addi"(%38, %39) : (index, index) -> index | |
| %41 = "arith.addi"(%40, %27) : (index, index) -> index | |
| %42 = "arith.cmpi"(%17, %10) {predicate = 2 : i64} : (index, index) -> i1 | |
| %43 = "arith.subi"(%4, %17) : (index, index) -> index | |
| %44 = "arith.select"(%42, %43, %17) : (i1, index, index) -> index | |
| %45 = "arith.divsi"(%44, %5) : (index, index) -> index | |
| %46 = "arith.subi"(%4, %45) : (index, index) -> index | |
| %47 = "arith.select"(%42, %46, %45) : (i1, index, index) -> index | |
| %48 = "arith.addi"(%41, %47) : (index, index) -> index | |
| %49 = "memref.load"(%19, %48) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
| %50 = "arith.muli"(%arg0, %3) : (index, index) -> index | |
| %51 = "arith.muli"(%arg1, %2) : (index, index) -> index | |
| %52 = "arith.addi"(%50, %51) : (index, index) -> index | |
| %53 = "arith.muli"(%29, %2) : (index, index) -> index | |
| %54 = "arith.addi"(%52, %53) : (index, index) -> index | |
| %55 = "arith.muli"(%23, %1) : (index, index) -> index | |
| %56 = "arith.addi"(%54, %55) : (index, index) -> index | |
| %57 = "arith.muli"(%28, %1) : (index, index) -> index | |
| %58 = "arith.addi"(%56, %57) : (index, index) -> index | |
| %59 = "arith.addi"(%58, %39) : (index, index) -> index | |
| %60 = "arith.addi"(%59, %27) : (index, index) -> index | |
| %61 = "arith.addi"(%60, %0) : (index, index) -> index | |
| "memref.store"(%49, %21, %61) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "scf.yield"() : () -> () | |
| }) : (index, index, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_15"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_15", sym_visibility = "private"} : () -> () | |
| %padded_733 = tensor.pad %51 low[0, 0, 1, 1] high[0, 0, 1, 1] { | |
| ^ | |
| /home/prashant/stable.mlir:1313:12: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion | |
| %130 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%cst_662 : tensor<320x320xf32>) outs(%101 : tensor<320x320xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:1313:12: note: see current operation: %187 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %130 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%cst_662 : tensor<320x320xf32>) outs(%101 : tensor<320x320xf32>) { | |
| ^ | |
| /home/prashant/stable.mlir:1313:12: note: see existing live user here: %195 = "spirv.UConvert"(%187) : (i32) -> i64 | |
| /home/prashant/stable.mlir:1320:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> | |
| %133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:1320:12: note: see current operation: | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 5 : index} : () -> index | |
| %1 = "arith.constant"() {value = 288 : index} : () -> index | |
| %2 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_35_matmul_18432x320x320", translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>, workgroup_size = [16 : index, 16 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -33 : index} : () -> index | |
| %1 = "arith.constant"() {value = 527 : index} : () -> index | |
| %2 = "arith.constant"() {value = 510 : index} : () -> index | |
| %3 = "arith.constant"() {value = 493 : index} : () -> index | |
| %4 = "arith.constant"() {value = 476 : index} : () -> index | |
| %5 = "arith.constant"() {value = 459 : index} : () -> index | |
| %6 = "arith.constant"() {value = 442 : index} : () -> index | |
| %7 = "arith.constant"() {value = 425 : index} : () -> index | |
| %8 = "arith.constant"() {value = 408 : index} : () -> index | |
| %9 = "arith.constant"() {value = 391 : index} : () -> index | |
| %10 = "arith.constant"() {value = 374 : index} : () -> index | |
| %11 = "arith.constant"() {value = 357 : index} : () -> index | |
| %12 = "arith.constant"() {value = 340 : index} : () -> index | |
| %13 = "arith.constant"() {value = 323 : index} : () -> index | |
| %14 = "arith.constant"() {value = 306 : index} : () -> index | |
| %15 = "arith.constant"() {value = 289 : index} : () -> index | |
| %16 = "arith.constant"() {value = 255 : index} : () -> index | |
| %17 = "arith.constant"() {value = 238 : index} : () -> index | |
| %18 = "arith.constant"() {value = 221 : index} : () -> index | |
| %19 = "arith.constant"() {value = 204 : index} : () -> index | |
| %20 = "arith.constant"() {value = 187 : index} : () -> index | |
| %21 = "arith.constant"() {value = 170 : index} : () -> index | |
| %22 = "arith.constant"() {value = 153 : index} : () -> index | |
| %23 = "arith.constant"() {value = 136 : index} : () -> index | |
| %24 = "arith.constant"() {value = 119 : index} : () -> index | |
| %25 = "arith.constant"() {value = 102 : index} : () -> index | |
| %26 = "arith.constant"() {value = 85 : index} : () -> index | |
| %27 = "arith.constant"() {value = 68 : index} : () -> index | |
| %28 = "arith.constant"() {value = 51 : index} : () -> index | |
| %29 = "arith.constant"() {value = 34 : index} : () -> index | |
| %30 = "arith.constant"() {value = 33 : index} : () -> index | |
| %31 = "arith.constant"() {value = 31 : index} : () -> index | |
| %32 = "arith.constant"() {value = 30 : index} : () -> index | |
| %33 = "arith.constant"() {value = 29 : index} : () -> index | |
| %34 = "arith.constant"() {value = 28 : index} : () -> index | |
| %35 = "arith.constant"() {value = 27 : index} : () -> index | |
| %36 = "arith.constant"() {value = 25 : index} : () -> index | |
| %37 = "arith.constant"() {value = 24 : index} : () -> index | |
| %38 = "arith.constant"() {value = 23 : index} : () -> index | |
| %39 = "arith.constant"() {value = 22 : index} : () -> index | |
| %40 = "arith.constant"() {value = 21 : index} : () -> index | |
| %41 = "arith.constant"() {value = 20 : index} : () -> index | |
| %42 = "arith.constant"() {value = 19 : index} : () -> index | |
| %43 = "arith.constant"() {value = 15 : index} : () -> index | |
| %44 = "arith.constant"() {value = 14 : index} : () -> index | |
| %45 = "arith.constant"() {value = 13 : index} : () -> index | |
| %46 = "arith.constant"() {value = 12 : index} : () -> index | |
| %47 = "arith.constant"() {value = 11 : index} : () -> index | |
| %48 = "arith.constant"() {value = 10 : index} : () -> index | |
| %49 = "arith.constant"() {value = 9 : index} : () -> index | |
| %50 = "arith.constant"() {value = 7 : index} : () -> index | |
| %51 = "arith.constant"() {value = 6 : index} : () -> index | |
| %52 = "arith.constant"() {value = 5 : index} : () -> index | |
| %53 = "arith.constant"() {value = 4 : index} : () -> index | |
| %54 = "arith.constant"() {value = 3 : index} : () -> index | |
| %55 = "arith.constant"() {value = 2 : index} : () -> index | |
| %56 = "arith.constant"() {value = 1 : index} : () -> index | |
| %57 = "arith.constant"() {value = 36 : index} : () -> index | |
| %58 = "arith.constant"() {value = 272 : index} : () -> index | |
| %59 = "arith.constant"() {value = 17 : index} : () -> index | |
| %60 = "arith.constant"() {value = 18 : index} : () -> index | |
| %61 = "arith.constant"() {value = 64 : index} : () -> index | |
| %62 = "arith.constant"() {value = 1280 : index} : () -> index | |
| %63 = "arith.constant"() {value = 1477120 : index} : () -> index | |
| %64 = "arith.constant"() {value = 72 : index} : () -> index | |
| %65 = "arith.constant"() {value = 8 : index} : () -> index | |
| %66 = "arith.constant"() {value = 2560 : index} : () -> index | |
| %67 = "arith.constant"() {value = 240 : index} : () -> index | |
| %68 = "arith.constant"() {value = 160 : index} : () -> index | |
| %69 = "arith.constant"() {value = 80 : index} : () -> index | |
| %70 = "arith.constant"() {value = -1 : index} : () -> index | |
| %71 = "arith.constant"() {value = 16 : index} : () -> index | |
| %72 = "arith.constant"() {value = 320 : index} : () -> index | |
| %73 = "arith.constant"() {value = 5120 : index} : () -> index | |
| %74 = "arith.constant"() {value = 0 : index} : () -> index | |
| %75 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %76 = "arith.constant"() {value = 25600 : index} : () -> index | |
| %77 = "arith.constant"() {value = 288 : index} : () -> index | |
| %78 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %79 = "arith.constant"() {value = 23592960 : index} : () -> index | |
| %80 = "arith.constant"() {value = 32 : index} : () -> index | |
| %81 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %82 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %83 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
| %84 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
| %85 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
| %86 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %87 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %88 = "arith.index_castui"(%86) : (i32) -> index | |
| %89 = "arith.index_castui"(%87) : (i32) -> index | |
| %90 = "hal.interface.binding.subspan"(%79, %75) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %91 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %92 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %93 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %94 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %95 = "hal.interface.binding.subspan"(%88, %76) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %96 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %97 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %98 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %99 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %100 = "hal.interface.binding.subspan"(%89, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %101 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %102 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %103 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %104 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %105 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %106 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %107 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %108 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %109 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %110 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %111 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %112 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %113 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %114 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %115 = "arith.muli"(%114, %73) : (index, index) -> index | |
| %116 = "arith.muli"(%82, %72) : (index, index) -> index | |
| %117 = "arith.addi"(%115, %116) : (index, index) -> index | |
| %118 = "arith.muli"(%113, %71) : (index, index) -> index | |
| %119 = "arith.addi"(%117, %118) : (index, index) -> index | |
| %120 = "arith.addi"(%119, %81) : (index, index) -> index | |
| %121 = "arith.cmpi"(%89, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %122 = "arith.subi"(%70, %89) : (index, index) -> index | |
| %123 = "arith.select"(%121, %122, %89) : (i1, index, index) -> index | |
| %124 = "arith.divsi"(%123, %71) : (index, index) -> index | |
| %125 = "arith.subi"(%70, %124) : (index, index) -> index | |
| %126 = "arith.select"(%121, %125, %124) : (i1, index, index) -> index | |
| %127 = "arith.addi"(%120, %126) : (index, index) -> index | |
| "memref.store"(%78, %101, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %128 = "arith.addi"(%127, %69) : (index, index) -> index | |
| "memref.store"(%78, %102, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %129 = "arith.addi"(%127, %68) : (index, index) -> index | |
| "memref.store"(%78, %103, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %130 = "arith.addi"(%127, %67) : (index, index) -> index | |
| "memref.store"(%78, %104, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %131 = "memref.load"(%105, %127) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %132 = "memref.load"(%106, %128) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %133 = "memref.load"(%107, %129) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %134 = "memref.load"(%108, %130) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %135 = "arith.addi"(%115, %81) : (index, index) -> index | |
| %136 = "arith.muli"(%82, %68) : (index, index) -> index | |
| %137 = "arith.addi"(%135, %136) : (index, index) -> index | |
| %138 = "arith.muli"(%83, %66) : (index, index) -> index | |
| %139 = "arith.addi"(%137, %138) : (index, index) -> index | |
| %140 = "arith.cmpi"(%81, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %141 = "arith.subi"(%70, %81) : (index, index) -> index | |
| %142 = "arith.select"(%140, %141, %81) : (i1, index, index) -> index | |
| %143 = "arith.divsi"(%142, %65) : (index, index) -> index | |
| %144 = "arith.subi"(%70, %143) : (index, index) -> index | |
| %145 = "arith.select"(%140, %144, %143) : (i1, index, index) -> index | |
| %146 = "arith.muli"(%145, %64) : (index, index) -> index | |
| %147 = "arith.addi"(%139, %146) : (index, index) -> index | |
| %148 = "arith.addi"(%147, %75) : (index, index) -> index | |
| %149 = "memref.load"(%91, %148) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %150 = "arith.addi"(%147, %63) : (index, index) -> index | |
| %151 = "memref.load"(%92, %150) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %152 = "arith.muli"(%82, %69) : (index, index) -> index | |
| %153 = "arith.addi"(%81, %152) : (index, index) -> index | |
| %154 = "arith.muli"(%83, %62) : (index, index) -> index | |
| %155 = "arith.addi"(%153, %154) : (index, index) -> index | |
| %156 = "arith.addi"(%155, %118) : (index, index) -> index | |
| %157 = "arith.divsi"(%142, %71) : (index, index) -> index | |
| %158 = "arith.subi"(%70, %157) : (index, index) -> index | |
| %159 = "arith.select"(%140, %158, %157) : (i1, index, index) -> index | |
| %160 = "arith.muli"(%159, %61) : (index, index) -> index | |
| %161 = "arith.addi"(%156, %160) : (index, index) -> index | |
| %162 = "arith.cmpi"(%88, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %163 = "arith.subi"(%70, %88) : (index, index) -> index | |
| %164 = "arith.select"(%162, %163, %88) : (i1, index, index) -> index | |
| %165 = "arith.divsi"(%164, %71) : (index, index) -> index | |
| %166 = "arith.subi"(%70, %165) : (index, index) -> index | |
| %167 = "arith.select"(%162, %166, %165) : (i1, index, index) -> index | |
| %168 = "arith.addi"(%161, %167) : (index, index) -> index | |
| %169 = "memref.load"(%96, %168) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %170 = "arith.addi"(%168, %62) : (index, index) -> index | |
| %171 = "memref.load"(%97, %170) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %172:8 = "scf.for"(%74, %77, %80, %131, %132, %133, %134, %149, %151, %169, %171) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>, %arg2: vector<4xf32>, %arg3: vector<4xf32>, %arg4: vector<4xf32>, %arg5: vector<4xf32>, %arg6: vector<4xf32>, %arg7: vector<4xf32>, %arg8: vector<4xf32>): | |
| "gpu.barrier"() : () -> () | |
| %696 = "arith.muli"(%82, %60) : (index, index) -> index | |
| %697 = "arith.addi"(%81, %696) : (index, index) -> index | |
| %698 = "arith.muli"(%83, %77) : (index, index) -> index | |
| %699 = "arith.addi"(%697, %698) : (index, index) -> index | |
| %700 = "arith.addi"(%699, %145) : (index, index) -> index | |
| "memref.store"(%arg5, %84, %700) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %701 = "arith.addi"(%700, %77) : (index, index) -> index | |
| "memref.store"(%arg6, %84, %701) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %702 = "arith.muli"(%82, %59) : (index, index) -> index | |
| %703 = "arith.addi"(%81, %702) : (index, index) -> index | |
| %704 = "arith.muli"(%83, %58) : (index, index) -> index | |
| %705 = "arith.addi"(%703, %704) : (index, index) -> index | |
| %706 = "arith.addi"(%705, %159) : (index, index) -> index | |
| "memref.store"(%arg7, %85, %706) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %707 = "arith.addi"(%706, %58) : (index, index) -> index | |
| "memref.store"(%arg8, %85, %707) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| "gpu.barrier"() : () -> () | |
| %708 = "arith.muli"(%82, %57) : (index, index) -> index | |
| %709 = "memref.load"(%84, %708) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %710 = "arith.addi"(%708, %56) : (index, index) -> index | |
| %711 = "memref.load"(%84, %710) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %712 = "arith.addi"(%708, %55) : (index, index) -> index | |
| %713 = "memref.load"(%84, %712) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %714 = "arith.addi"(%708, %54) : (index, index) -> index | |
| %715 = "memref.load"(%84, %714) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %716 = "arith.addi"(%708, %53) : (index, index) -> index | |
| %717 = "memref.load"(%84, %716) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %718 = "arith.addi"(%708, %52) : (index, index) -> index | |
| %719 = "memref.load"(%84, %718) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %720 = "arith.addi"(%708, %51) : (index, index) -> index | |
| %721 = "memref.load"(%84, %720) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %722 = "arith.addi"(%708, %50) : (index, index) -> index | |
| %723 = "memref.load"(%84, %722) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %724 = "arith.addi"(%708, %49) : (index, index) -> index | |
| %725 = "memref.load"(%84, %724) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %726 = "arith.addi"(%708, %48) : (index, index) -> index | |
| %727 = "memref.load"(%84, %726) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %728 = "arith.addi"(%708, %47) : (index, index) -> index | |
| %729 = "memref.load"(%84, %728) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %730 = "arith.addi"(%708, %46) : (index, index) -> index | |
| %731 = "memref.load"(%84, %730) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %732 = "arith.addi"(%708, %45) : (index, index) -> index | |
| %733 = "memref.load"(%84, %732) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %734 = "arith.addi"(%708, %44) : (index, index) -> index | |
| %735 = "memref.load"(%84, %734) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %736 = "arith.addi"(%708, %43) : (index, index) -> index | |
| %737 = "memref.load"(%84, %736) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %738 = "arith.addi"(%708, %71) : (index, index) -> index | |
| %739 = "memref.load"(%84, %738) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %740 = "arith.addi"(%708, %60) : (index, index) -> index | |
| %741 = "memref.load"(%84, %740) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %742 = "arith.addi"(%708, %42) : (index, index) -> index | |
| %743 = "memref.load"(%84, %742) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %744 = "arith.addi"(%708, %41) : (index, index) -> index | |
| %745 = "memref.load"(%84, %744) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %746 = "arith.addi"(%708, %40) : (index, index) -> index | |
| %747 = "memref.load"(%84, %746) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %748 = "arith.addi"(%708, %39) : (index, index) -> index | |
| %749 = "memref.load"(%84, %748) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %750 = "arith.addi"(%708, %38) : (index, index) -> index | |
| %751 = "memref.load"(%84, %750) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %752 = "arith.addi"(%708, %37) : (index, index) -> index | |
| %753 = "memref.load"(%84, %752) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %754 = "arith.addi"(%708, %36) : (index, index) -> index | |
| %755 = "memref.load"(%84, %754) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %756 = "arith.addi"(%708, %35) : (index, index) -> index | |
| %757 = "memref.load"(%84, %756) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %758 = "arith.addi"(%708, %34) : (index, index) -> index | |
| %759 = "memref.load"(%84, %758) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %760 = "arith.addi"(%708, %33) : (index, index) -> index | |
| %761 = "memref.load"(%84, %760) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %762 = "arith.addi"(%708, %32) : (index, index) -> index | |
| %763 = "memref.load"(%84, %762) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %764 = "arith.addi"(%708, %31) : (index, index) -> index | |
| %765 = "memref.load"(%84, %764) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %766 = "arith.addi"(%708, %80) : (index, index) -> index | |
| %767 = "memref.load"(%84, %766) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %768 = "arith.addi"(%708, %30) : (index, index) -> index | |
| %769 = "memref.load"(%84, %768) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %770 = "arith.addi"(%708, %29) : (index, index) -> index | |
| %771 = "memref.load"(%84, %770) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %772 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %773 = "arith.addi"(%81, %59) : (index, index) -> index | |
| %774 = "memref.load"(%85, %773) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %775 = "arith.addi"(%81, %29) : (index, index) -> index | |
| %776 = "memref.load"(%85, %775) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %777 = "arith.addi"(%81, %28) : (index, index) -> index | |
| %778 = "memref.load"(%85, %777) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %779 = "arith.addi"(%81, %27) : (index, index) -> index | |
| %780 = "memref.load"(%85, %779) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %781 = "arith.addi"(%81, %26) : (index, index) -> index | |
| %782 = "memref.load"(%85, %781) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %783 = "arith.addi"(%81, %25) : (index, index) -> index | |
| %784 = "memref.load"(%85, %783) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %785 = "arith.addi"(%81, %24) : (index, index) -> index | |
| %786 = "memref.load"(%85, %785) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %787 = "arith.addi"(%81, %23) : (index, index) -> index | |
| %788 = "memref.load"(%85, %787) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %789 = "arith.addi"(%81, %22) : (index, index) -> index | |
| %790 = "memref.load"(%85, %789) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %791 = "arith.addi"(%81, %21) : (index, index) -> index | |
| %792 = "memref.load"(%85, %791) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %793 = "arith.addi"(%81, %20) : (index, index) -> index | |
| %794 = "memref.load"(%85, %793) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %795 = "arith.addi"(%81, %19) : (index, index) -> index | |
| %796 = "memref.load"(%85, %795) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %797 = "arith.addi"(%81, %18) : (index, index) -> index | |
| %798 = "memref.load"(%85, %797) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %799 = "arith.addi"(%81, %17) : (index, index) -> index | |
| %800 = "memref.load"(%85, %799) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %801 = "arith.addi"(%81, %16) : (index, index) -> index | |
| %802 = "memref.load"(%85, %801) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %803 = "arith.addi"(%81, %58) : (index, index) -> index | |
| %804 = "memref.load"(%85, %803) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %805 = "arith.addi"(%81, %15) : (index, index) -> index | |
| %806 = "memref.load"(%85, %805) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %807 = "arith.addi"(%81, %14) : (index, index) -> index | |
| %808 = "memref.load"(%85, %807) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %809 = "arith.addi"(%81, %13) : (index, index) -> index | |
| %810 = "memref.load"(%85, %809) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %811 = "arith.addi"(%81, %12) : (index, index) -> index | |
| %812 = "memref.load"(%85, %811) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %813 = "arith.addi"(%81, %11) : (index, index) -> index | |
| %814 = "memref.load"(%85, %813) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %815 = "arith.addi"(%81, %10) : (index, index) -> index | |
| %816 = "memref.load"(%85, %815) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %817 = "arith.addi"(%81, %9) : (index, index) -> index | |
| %818 = "memref.load"(%85, %817) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %819 = "arith.addi"(%81, %8) : (index, index) -> index | |
| %820 = "memref.load"(%85, %819) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %821 = "arith.addi"(%81, %7) : (index, index) -> index | |
| %822 = "memref.load"(%85, %821) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %823 = "arith.addi"(%81, %6) : (index, index) -> index | |
| %824 = "memref.load"(%85, %823) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %825 = "arith.addi"(%81, %5) : (index, index) -> index | |
| %826 = "memref.load"(%85, %825) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %827 = "arith.addi"(%81, %4) : (index, index) -> index | |
| %828 = "memref.load"(%85, %827) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %829 = "arith.addi"(%81, %3) : (index, index) -> index | |
| %830 = "memref.load"(%85, %829) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %831 = "arith.addi"(%81, %2) : (index, index) -> index | |
| %832 = "memref.load"(%85, %831) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %833 = "arith.addi"(%81, %1) : (index, index) -> index | |
| %834 = "memref.load"(%85, %833) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %835 = "vector.extract"(%709) {position = [0]} : (vector<4xf32>) -> f32 | |
| %836 = "vector.splat"(%835) : (f32) -> vector<4xf32> | |
| %837 = "vector.fma"(%836, %772, %arg1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %838 = "vector.extract"(%709) {position = [1]} : (vector<4xf32>) -> f32 | |
| %839 = "vector.splat"(%838) : (f32) -> vector<4xf32> | |
| %840 = "vector.fma"(%839, %774, %837) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %841 = "vector.extract"(%709) {position = [2]} : (vector<4xf32>) -> f32 | |
| %842 = "vector.splat"(%841) : (f32) -> vector<4xf32> | |
| %843 = "vector.fma"(%842, %776, %840) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %844 = "vector.extract"(%709) {position = [3]} : (vector<4xf32>) -> f32 | |
| %845 = "vector.splat"(%844) : (f32) -> vector<4xf32> | |
| %846 = "vector.fma"(%845, %778, %843) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %847 = "vector.extract"(%711) {position = [0]} : (vector<4xf32>) -> f32 | |
| %848 = "vector.splat"(%847) : (f32) -> vector<4xf32> | |
| %849 = "vector.fma"(%848, %780, %846) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %850 = "vector.extract"(%711) {position = [1]} : (vector<4xf32>) -> f32 | |
| %851 = "vector.splat"(%850) : (f32) -> vector<4xf32> | |
| %852 = "vector.fma"(%851, %782, %849) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %853 = "vector.extract"(%711) {position = [2]} : (vector<4xf32>) -> f32 | |
| %854 = "vector.splat"(%853) : (f32) -> vector<4xf32> | |
| %855 = "vector.fma"(%854, %784, %852) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %856 = "vector.extract"(%711) {position = [3]} : (vector<4xf32>) -> f32 | |
| %857 = "vector.splat"(%856) : (f32) -> vector<4xf32> | |
| %858 = "vector.fma"(%857, %786, %855) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %859 = "vector.extract"(%713) {position = [0]} : (vector<4xf32>) -> f32 | |
| %860 = "vector.splat"(%859) : (f32) -> vector<4xf32> | |
| %861 = "vector.fma"(%860, %788, %858) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %862 = "vector.extract"(%713) {position = [1]} : (vector<4xf32>) -> f32 | |
| %863 = "vector.splat"(%862) : (f32) -> vector<4xf32> | |
| %864 = "vector.fma"(%863, %790, %861) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %865 = "vector.extract"(%713) {position = [2]} : (vector<4xf32>) -> f32 | |
| %866 = "vector.splat"(%865) : (f32) -> vector<4xf32> | |
| %867 = "vector.fma"(%866, %792, %864) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %868 = "vector.extract"(%713) {position = [3]} : (vector<4xf32>) -> f32 | |
| %869 = "vector.splat"(%868) : (f32) -> vector<4xf32> | |
| %870 = "vector.fma"(%869, %794, %867) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %871 = "vector.extract"(%715) {position = [0]} : (vector<4xf32>) -> f32 | |
| %872 = "vector.splat"(%871) : (f32) -> vector<4xf32> | |
| %873 = "vector.fma"(%872, %796, %870) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %874 = "vector.extract"(%715) {position = [1]} : (vector<4xf32>) -> f32 | |
| %875 = "vector.splat"(%874) : (f32) -> vector<4xf32> | |
| %876 = "vector.fma"(%875, %798, %873) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %877 = "vector.extract"(%715) {position = [2]} : (vector<4xf32>) -> f32 | |
| %878 = "vector.splat"(%877) : (f32) -> vector<4xf32> | |
| %879 = "vector.fma"(%878, %800, %876) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %880 = "vector.extract"(%715) {position = [3]} : (vector<4xf32>) -> f32 | |
| %881 = "vector.splat"(%880) : (f32) -> vector<4xf32> | |
| %882 = "vector.fma"(%881, %802, %879) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %883 = "vector.extract"(%717) {position = [0]} : (vector<4xf32>) -> f32 | |
| %884 = "vector.splat"(%883) : (f32) -> vector<4xf32> | |
| %885 = "vector.fma"(%884, %804, %882) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %886 = "vector.extract"(%717) {position = [1]} : (vector<4xf32>) -> f32 | |
| %887 = "vector.splat"(%886) : (f32) -> vector<4xf32> | |
| %888 = "vector.fma"(%887, %806, %885) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %889 = "vector.extract"(%717) {position = [2]} : (vector<4xf32>) -> f32 | |
| %890 = "vector.splat"(%889) : (f32) -> vector<4xf32> | |
| %891 = "vector.fma"(%890, %808, %888) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %892 = "vector.extract"(%717) {position = [3]} : (vector<4xf32>) -> f32 | |
| %893 = "vector.splat"(%892) : (f32) -> vector<4xf32> | |
| %894 = "vector.fma"(%893, %810, %891) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %895 = "vector.extract"(%719) {position = [0]} : (vector<4xf32>) -> f32 | |
| %896 = "vector.splat"(%895) : (f32) -> vector<4xf32> | |
| %897 = "vector.fma"(%896, %812, %894) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %898 = "vector.extract"(%719) {position = [1]} : (vector<4xf32>) -> f32 | |
| %899 = "vector.splat"(%898) : (f32) -> vector<4xf32> | |
| %900 = "vector.fma"(%899, %814, %897) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %901 = "vector.extract"(%719) {position = [2]} : (vector<4xf32>) -> f32 | |
| %902 = "vector.splat"(%901) : (f32) -> vector<4xf32> | |
| %903 = "vector.fma"(%902, %816, %900) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %904 = "vector.extract"(%719) {position = [3]} : (vector<4xf32>) -> f32 | |
| %905 = "vector.splat"(%904) : (f32) -> vector<4xf32> | |
| %906 = "vector.fma"(%905, %818, %903) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %907 = "vector.extract"(%721) {position = [0]} : (vector<4xf32>) -> f32 | |
| %908 = "vector.splat"(%907) : (f32) -> vector<4xf32> | |
| %909 = "vector.fma"(%908, %820, %906) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %910 = "vector.extract"(%721) {position = [1]} : (vector<4xf32>) -> f32 | |
| %911 = "vector.splat"(%910) : (f32) -> vector<4xf32> | |
| %912 = "vector.fma"(%911, %822, %909) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %913 = "vector.extract"(%721) {position = [2]} : (vector<4xf32>) -> f32 | |
| %914 = "vector.splat"(%913) : (f32) -> vector<4xf32> | |
| %915 = "vector.fma"(%914, %824, %912) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %916 = "vector.extract"(%721) {position = [3]} : (vector<4xf32>) -> f32 | |
| %917 = "vector.splat"(%916) : (f32) -> vector<4xf32> | |
| %918 = "vector.fma"(%917, %826, %915) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %919 = "vector.extract"(%723) {position = [0]} : (vector<4xf32>) -> f32 | |
| %920 = "vector.splat"(%919) : (f32) -> vector<4xf32> | |
| %921 = "vector.fma"(%920, %828, %918) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %922 = "vector.extract"(%723) {position = [1]} : (vector<4xf32>) -> f32 | |
| %923 = "vector.splat"(%922) : (f32) -> vector<4xf32> | |
| %924 = "vector.fma"(%923, %830, %921) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %925 = "vector.extract"(%723) {position = [2]} : (vector<4xf32>) -> f32 | |
| %926 = "vector.splat"(%925) : (f32) -> vector<4xf32> | |
| %927 = "vector.fma"(%926, %832, %924) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %928 = "vector.extract"(%723) {position = [3]} : (vector<4xf32>) -> f32 | |
| %929 = "vector.splat"(%928) : (f32) -> vector<4xf32> | |
| %930 = "vector.fma"(%929, %834, %927) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %931 = "vector.extract"(%725) {position = [0]} : (vector<4xf32>) -> f32 | |
| %932 = "vector.splat"(%931) : (f32) -> vector<4xf32> | |
| %933 = "vector.fma"(%932, %772, %arg2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %934 = "vector.extract"(%725) {position = [1]} : (vector<4xf32>) -> f32 | |
| %935 = "vector.splat"(%934) : (f32) -> vector<4xf32> | |
| %936 = "vector.fma"(%935, %774, %933) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %937 = "vector.extract"(%725) {position = [2]} : (vector<4xf32>) -> f32 | |
| %938 = "vector.splat"(%937) : (f32) -> vector<4xf32> | |
| %939 = "vector.fma"(%938, %776, %936) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %940 = "vector.extract"(%725) {position = [3]} : (vector<4xf32>) -> f32 | |
| %941 = "vector.splat"(%940) : (f32) -> vector<4xf32> | |
| %942 = "vector.fma"(%941, %778, %939) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %943 = "vector.extract"(%727) {position = [0]} : (vector<4xf32>) -> f32 | |
| %944 = "vector.splat"(%943) : (f32) -> vector<4xf32> | |
| %945 = "vector.fma"(%944, %780, %942) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %946 = "vector.extract"(%727) {position = [1]} : (vector<4xf32>) -> f32 | |
| %947 = "vector.splat"(%946) : (f32) -> vector<4xf32> | |
| %948 = "vector.fma"(%947, %782, %945) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %949 = "vector.extract"(%727) {position = [2]} : (vector<4xf32>) -> f32 | |
| %950 = "vector.splat"(%949) : (f32) -> vector<4xf32> | |
| %951 = "vector.fma"(%950, %784, %948) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %952 = "vector.extract"(%727) {position = [3]} : (vector<4xf32>) -> f32 | |
| %953 = "vector.splat"(%952) : (f32) -> vector<4xf32> | |
| %954 = "vector.fma"(%953, %786, %951) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %955 = "vector.extract"(%729) {position = [0]} : (vector<4xf32>) -> f32 | |
| %956 = "vector.splat"(%955) : (f32) -> vector<4xf32> | |
| %957 = "vector.fma"(%956, %788, %954) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %958 = "vector.extract"(%729) {position = [1]} : (vector<4xf32>) -> f32 | |
| %959 = "vector.splat"(%958) : (f32) -> vector<4xf32> | |
| %960 = "vector.fma"(%959, %790, %957) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %961 = "vector.extract"(%729) {position = [2]} : (vector<4xf32>) -> f32 | |
| %962 = "vector.splat"(%961) : (f32) -> vector<4xf32> | |
| %963 = "vector.fma"(%962, %792, %960) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %964 = "vector.extract"(%729) {position = [3]} : (vector<4xf32>) -> f32 | |
| %965 = "vector.splat"(%964) : (f32) -> vector<4xf32> | |
| %966 = "vector.fma"(%965, %794, %963) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %967 = "vector.extract"(%731) {position = [0]} : (vector<4xf32>) -> f32 | |
| %968 = "vector.splat"(%967) : (f32) -> vector<4xf32> | |
| %969 = "vector.fma"(%968, %796, %966) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %970 = "vector.extract"(%731) {position = [1]} : (vector<4xf32>) -> f32 | |
| %971 = "vector.splat"(%970) : (f32) -> vector<4xf32> | |
| %972 = "vector.fma"(%971, %798, %969) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %973 = "vector.extract"(%731) {position = [2]} : (vector<4xf32>) -> f32 | |
| %974 = "vector.splat"(%973) : (f32) -> vector<4xf32> | |
| %975 = "vector.fma"(%974, %800, %972) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %976 = "vector.extract"(%731) {position = [3]} : (vector<4xf32>) -> f32 | |
| %977 = "vector.splat"(%976) : (f32) -> vector<4xf32> | |
| %978 = "vector.fma"(%977, %802, %975) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %979 = "vector.extract"(%733) {position = [0]} : (vector<4xf32>) -> f32 | |
| %980 = "vector.splat"(%979) : (f32) -> vector<4xf32> | |
| %981 = "vector.fma"(%980, %804, %978) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %982 = "vector.extract"(%733) {position = [1]} : (vector<4xf32>) -> f32 | |
| %983 = "vector.splat"(%982) : (f32) -> vector<4xf32> | |
| %984 = "vector.fma"(%983, %806, %981) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %985 = "vector.extract"(%733) {position = [2]} : (vector<4xf32>) -> f32 | |
| %986 = "vector.splat"(%985) : (f32) -> vector<4xf32> | |
| %987 = "vector.fma"(%986, %808, %984) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %988 = "vector.extract"(%733) {position = [3]} : (vector<4xf32>) -> f32 | |
| %989 = "vector.splat"(%988) : (f32) -> vector<4xf32> | |
| %990 = "vector.fma"(%989, %810, %987) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %991 = "vector.extract"(%735) {position = [0]} : (vector<4xf32>) -> f32 | |
| %992 = "vector.splat"(%991) : (f32) -> vector<4xf32> | |
| %993 = "vector.fma"(%992, %812, %990) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %994 = "vector.extract"(%735) {position = [1]} : (vector<4xf32>) -> f32 | |
| %995 = "vector.splat"(%994) : (f32) -> vector<4xf32> | |
| %996 = "vector.fma"(%995, %814, %993) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %997 = "vector.extract"(%735) {position = [2]} : (vector<4xf32>) -> f32 | |
| %998 = "vector.splat"(%997) : (f32) -> vector<4xf32> | |
| %999 = "vector.fma"(%998, %816, %996) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1000 = "vector.extract"(%735) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1001 = "vector.splat"(%1000) : (f32) -> vector<4xf32> | |
| %1002 = "vector.fma"(%1001, %818, %999) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1003 = "vector.extract"(%737) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1004 = "vector.splat"(%1003) : (f32) -> vector<4xf32> | |
| %1005 = "vector.fma"(%1004, %820, %1002) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1006 = "vector.extract"(%737) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1007 = "vector.splat"(%1006) : (f32) -> vector<4xf32> | |
| %1008 = "vector.fma"(%1007, %822, %1005) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1009 = "vector.extract"(%737) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1010 = "vector.splat"(%1009) : (f32) -> vector<4xf32> | |
| %1011 = "vector.fma"(%1010, %824, %1008) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1012 = "vector.extract"(%737) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1013 = "vector.splat"(%1012) : (f32) -> vector<4xf32> | |
| %1014 = "vector.fma"(%1013, %826, %1011) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1015 = "vector.extract"(%739) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1016 = "vector.splat"(%1015) : (f32) -> vector<4xf32> | |
| %1017 = "vector.fma"(%1016, %828, %1014) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1018 = "vector.extract"(%739) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1019 = "vector.splat"(%1018) : (f32) -> vector<4xf32> | |
| %1020 = "vector.fma"(%1019, %830, %1017) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1021 = "vector.extract"(%739) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1022 = "vector.splat"(%1021) : (f32) -> vector<4xf32> | |
| %1023 = "vector.fma"(%1022, %832, %1020) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1024 = "vector.extract"(%739) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1025 = "vector.splat"(%1024) : (f32) -> vector<4xf32> | |
| %1026 = "vector.fma"(%1025, %834, %1023) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1027 = "vector.extract"(%741) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1028 = "vector.splat"(%1027) : (f32) -> vector<4xf32> | |
| %1029 = "vector.fma"(%1028, %772, %arg3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1030 = "vector.extract"(%741) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1031 = "vector.splat"(%1030) : (f32) -> vector<4xf32> | |
| %1032 = "vector.fma"(%1031, %774, %1029) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1033 = "vector.extract"(%741) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1034 = "vector.splat"(%1033) : (f32) -> vector<4xf32> | |
| %1035 = "vector.fma"(%1034, %776, %1032) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1036 = "vector.extract"(%741) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1037 = "vector.splat"(%1036) : (f32) -> vector<4xf32> | |
| %1038 = "vector.fma"(%1037, %778, %1035) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1039 = "vector.extract"(%743) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1040 = "vector.splat"(%1039) : (f32) -> vector<4xf32> | |
| %1041 = "vector.fma"(%1040, %780, %1038) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1042 = "vector.extract"(%743) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1043 = "vector.splat"(%1042) : (f32) -> vector<4xf32> | |
| %1044 = "vector.fma"(%1043, %782, %1041) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1045 = "vector.extract"(%743) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1046 = "vector.splat"(%1045) : (f32) -> vector<4xf32> | |
| %1047 = "vector.fma"(%1046, %784, %1044) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1048 = "vector.extract"(%743) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1049 = "vector.splat"(%1048) : (f32) -> vector<4xf32> | |
| %1050 = "vector.fma"(%1049, %786, %1047) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1051 = "vector.extract"(%745) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1052 = "vector.splat"(%1051) : (f32) -> vector<4xf32> | |
| %1053 = "vector.fma"(%1052, %788, %1050) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1054 = "vector.extract"(%745) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1055 = "vector.splat"(%1054) : (f32) -> vector<4xf32> | |
| %1056 = "vector.fma"(%1055, %790, %1053) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1057 = "vector.extract"(%745) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1058 = "vector.splat"(%1057) : (f32) -> vector<4xf32> | |
| %1059 = "vector.fma"(%1058, %792, %1056) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1060 = "vector.extract"(%745) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1061 = "vector.splat"(%1060) : (f32) -> vector<4xf32> | |
| %1062 = "vector.fma"(%1061, %794, %1059) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1063 = "vector.extract"(%747) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1064 = "vector.splat"(%1063) : (f32) -> vector<4xf32> | |
| %1065 = "vector.fma"(%1064, %796, %1062) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1066 = "vector.extract"(%747) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1067 = "vector.splat"(%1066) : (f32) -> vector<4xf32> | |
| %1068 = "vector.fma"(%1067, %798, %1065) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1069 = "vector.extract"(%747) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1070 = "vector.splat"(%1069) : (f32) -> vector<4xf32> | |
| %1071 = "vector.fma"(%1070, %800, %1068) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1072 = "vector.extract"(%747) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1073 = "vector.splat"(%1072) : (f32) -> vector<4xf32> | |
| %1074 = "vector.fma"(%1073, %802, %1071) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1075 = "vector.extract"(%749) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1076 = "vector.splat"(%1075) : (f32) -> vector<4xf32> | |
| %1077 = "vector.fma"(%1076, %804, %1074) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1078 = "vector.extract"(%749) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1079 = "vector.splat"(%1078) : (f32) -> vector<4xf32> | |
| %1080 = "vector.fma"(%1079, %806, %1077) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1081 = "vector.extract"(%749) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1082 = "vector.splat"(%1081) : (f32) -> vector<4xf32> | |
| %1083 = "vector.fma"(%1082, %808, %1080) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1084 = "vector.extract"(%749) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1085 = "vector.splat"(%1084) : (f32) -> vector<4xf32> | |
| %1086 = "vector.fma"(%1085, %810, %1083) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1087 = "vector.extract"(%751) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1088 = "vector.splat"(%1087) : (f32) -> vector<4xf32> | |
| %1089 = "vector.fma"(%1088, %812, %1086) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1090 = "vector.extract"(%751) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1091 = "vector.splat"(%1090) : (f32) -> vector<4xf32> | |
| %1092 = "vector.fma"(%1091, %814, %1089) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1093 = "vector.extract"(%751) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1094 = "vector.splat"(%1093) : (f32) -> vector<4xf32> | |
| %1095 = "vector.fma"(%1094, %816, %1092) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1096 = "vector.extract"(%751) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1097 = "vector.splat"(%1096) : (f32) -> vector<4xf32> | |
| %1098 = "vector.fma"(%1097, %818, %1095) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1099 = "vector.extract"(%753) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1100 = "vector.splat"(%1099) : (f32) -> vector<4xf32> | |
| %1101 = "vector.fma"(%1100, %820, %1098) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1102 = "vector.extract"(%753) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1103 = "vector.splat"(%1102) : (f32) -> vector<4xf32> | |
| %1104 = "vector.fma"(%1103, %822, %1101) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1105 = "vector.extract"(%753) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1106 = "vector.splat"(%1105) : (f32) -> vector<4xf32> | |
| %1107 = "vector.fma"(%1106, %824, %1104) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1108 = "vector.extract"(%753) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1109 = "vector.splat"(%1108) : (f32) -> vector<4xf32> | |
| %1110 = "vector.fma"(%1109, %826, %1107) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1111 = "vector.extract"(%755) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1112 = "vector.splat"(%1111) : (f32) -> vector<4xf32> | |
| %1113 = "vector.fma"(%1112, %828, %1110) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1114 = "vector.extract"(%755) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1115 = "vector.splat"(%1114) : (f32) -> vector<4xf32> | |
| %1116 = "vector.fma"(%1115, %830, %1113) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1117 = "vector.extract"(%755) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1118 = "vector.splat"(%1117) : (f32) -> vector<4xf32> | |
| %1119 = "vector.fma"(%1118, %832, %1116) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1120 = "vector.extract"(%755) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1121 = "vector.splat"(%1120) : (f32) -> vector<4xf32> | |
| %1122 = "vector.fma"(%1121, %834, %1119) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1123 = "vector.extract"(%757) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1124 = "vector.splat"(%1123) : (f32) -> vector<4xf32> | |
| %1125 = "vector.fma"(%1124, %772, %arg4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1126 = "vector.extract"(%757) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1127 = "vector.splat"(%1126) : (f32) -> vector<4xf32> | |
| %1128 = "vector.fma"(%1127, %774, %1125) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1129 = "vector.extract"(%757) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1130 = "vector.splat"(%1129) : (f32) -> vector<4xf32> | |
| %1131 = "vector.fma"(%1130, %776, %1128) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1132 = "vector.extract"(%757) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1133 = "vector.splat"(%1132) : (f32) -> vector<4xf32> | |
| %1134 = "vector.fma"(%1133, %778, %1131) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1135 = "vector.extract"(%759) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1136 = "vector.splat"(%1135) : (f32) -> vector<4xf32> | |
| %1137 = "vector.fma"(%1136, %780, %1134) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1138 = "vector.extract"(%759) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1139 = "vector.splat"(%1138) : (f32) -> vector<4xf32> | |
| %1140 = "vector.fma"(%1139, %782, %1137) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1141 = "vector.extract"(%759) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1142 = "vector.splat"(%1141) : (f32) -> vector<4xf32> | |
| %1143 = "vector.fma"(%1142, %784, %1140) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1144 = "vector.extract"(%759) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1145 = "vector.splat"(%1144) : (f32) -> vector<4xf32> | |
| %1146 = "vector.fma"(%1145, %786, %1143) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1147 = "vector.extract"(%761) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1148 = "vector.splat"(%1147) : (f32) -> vector<4xf32> | |
| %1149 = "vector.fma"(%1148, %788, %1146) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1150 = "vector.extract"(%761) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1151 = "vector.splat"(%1150) : (f32) -> vector<4xf32> | |
| %1152 = "vector.fma"(%1151, %790, %1149) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1153 = "vector.extract"(%761) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1154 = "vector.splat"(%1153) : (f32) -> vector<4xf32> | |
| %1155 = "vector.fma"(%1154, %792, %1152) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1156 = "vector.extract"(%761) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1157 = "vector.splat"(%1156) : (f32) -> vector<4xf32> | |
| %1158 = "vector.fma"(%1157, %794, %1155) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1159 = "vector.extract"(%763) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1160 = "vector.splat"(%1159) : (f32) -> vector<4xf32> | |
| %1161 = "vector.fma"(%1160, %796, %1158) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1162 = "vector.extract"(%763) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1163 = "vector.splat"(%1162) : (f32) -> vector<4xf32> | |
| %1164 = "vector.fma"(%1163, %798, %1161) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1165 = "vector.extract"(%763) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1166 = "vector.splat"(%1165) : (f32) -> vector<4xf32> | |
| %1167 = "vector.fma"(%1166, %800, %1164) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1168 = "vector.extract"(%763) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1169 = "vector.splat"(%1168) : (f32) -> vector<4xf32> | |
| %1170 = "vector.fma"(%1169, %802, %1167) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1171 = "vector.extract"(%765) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1172 = "vector.splat"(%1171) : (f32) -> vector<4xf32> | |
| %1173 = "vector.fma"(%1172, %804, %1170) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1174 = "vector.extract"(%765) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1175 = "vector.splat"(%1174) : (f32) -> vector<4xf32> | |
| %1176 = "vector.fma"(%1175, %806, %1173) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1177 = "vector.extract"(%765) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1178 = "vector.splat"(%1177) : (f32) -> vector<4xf32> | |
| %1179 = "vector.fma"(%1178, %808, %1176) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1180 = "vector.extract"(%765) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1181 = "vector.splat"(%1180) : (f32) -> vector<4xf32> | |
| %1182 = "vector.fma"(%1181, %810, %1179) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1183 = "vector.extract"(%767) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1184 = "vector.splat"(%1183) : (f32) -> vector<4xf32> | |
| %1185 = "vector.fma"(%1184, %812, %1182) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1186 = "vector.extract"(%767) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1187 = "vector.splat"(%1186) : (f32) -> vector<4xf32> | |
| %1188 = "vector.fma"(%1187, %814, %1185) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1189 = "vector.extract"(%767) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1190 = "vector.splat"(%1189) : (f32) -> vector<4xf32> | |
| %1191 = "vector.fma"(%1190, %816, %1188) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1192 = "vector.extract"(%767) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1193 = "vector.splat"(%1192) : (f32) -> vector<4xf32> | |
| %1194 = "vector.fma"(%1193, %818, %1191) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1195 = "vector.extract"(%769) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1196 = "vector.splat"(%1195) : (f32) -> vector<4xf32> | |
| %1197 = "vector.fma"(%1196, %820, %1194) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1198 = "vector.extract"(%769) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1199 = "vector.splat"(%1198) : (f32) -> vector<4xf32> | |
| %1200 = "vector.fma"(%1199, %822, %1197) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1201 = "vector.extract"(%769) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1202 = "vector.splat"(%1201) : (f32) -> vector<4xf32> | |
| %1203 = "vector.fma"(%1202, %824, %1200) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1204 = "vector.extract"(%769) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1205 = "vector.splat"(%1204) : (f32) -> vector<4xf32> | |
| %1206 = "vector.fma"(%1205, %826, %1203) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1207 = "vector.extract"(%771) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1208 = "vector.splat"(%1207) : (f32) -> vector<4xf32> | |
| %1209 = "vector.fma"(%1208, %828, %1206) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1210 = "vector.extract"(%771) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1211 = "vector.splat"(%1210) : (f32) -> vector<4xf32> | |
| %1212 = "vector.fma"(%1211, %830, %1209) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1213 = "vector.extract"(%771) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1214 = "vector.splat"(%1213) : (f32) -> vector<4xf32> | |
| %1215 = "vector.fma"(%1214, %832, %1212) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1216 = "vector.extract"(%771) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1217 = "vector.splat"(%1216) : (f32) -> vector<4xf32> | |
| %1218 = "vector.fma"(%1217, %834, %1215) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1219 = "arith.addi"(%arg0, %80) : (index, index) -> index | |
| %1220 = "arith.cmpi"(%1219, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %1221 = "arith.subi"(%0, %arg0) : (index, index) -> index | |
| %1222 = "arith.select"(%1220, %1221, %1219) : (i1, index, index) -> index | |
| %1223 = "arith.divsi"(%1222, %53) : (index, index) -> index | |
| %1224 = "arith.subi"(%70, %1223) : (index, index) -> index | |
| %1225 = "arith.select"(%1220, %1224, %1223) : (i1, index, index) -> index | |
| %1226 = "arith.addi"(%139, %1225) : (index, index) -> index | |
| %1227 = "arith.addi"(%1226, %146) : (index, index) -> index | |
| %1228 = "arith.addi"(%1227, %75) : (index, index) -> index | |
| %1229 = "memref.load"(%93, %1228) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1230 = "arith.addi"(%1227, %63) : (index, index) -> index | |
| %1231 = "memref.load"(%94, %1230) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1232 = "arith.muli"(%1219, %69) : (index, index) -> index | |
| %1233 = "arith.addi"(%1232, %81) : (index, index) -> index | |
| %1234 = "arith.addi"(%1233, %152) : (index, index) -> index | |
| %1235 = "arith.addi"(%1234, %154) : (index, index) -> index | |
| %1236 = "arith.addi"(%1235, %118) : (index, index) -> index | |
| %1237 = "arith.addi"(%1236, %160) : (index, index) -> index | |
| %1238 = "arith.addi"(%1237, %167) : (index, index) -> index | |
| %1239 = "memref.load"(%98, %1238) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1240 = "arith.addi"(%1238, %62) : (index, index) -> index | |
| %1241 = "memref.load"(%99, %1240) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| "scf.yield"(%930, %1026, %1122, %1218, %1229, %1231, %1239, %1241) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) | |
| "gpu.barrier"() : () -> () | |
| %173 = "arith.muli"(%82, %60) : (index, index) -> index | |
| %174 = "arith.addi"(%81, %173) : (index, index) -> index | |
| %175 = "arith.muli"(%83, %77) : (index, index) -> index | |
| %176 = "arith.addi"(%174, %175) : (index, index) -> index | |
| %177 = "arith.addi"(%176, %145) : (index, index) -> index | |
| "memref.store"(%172#4, %84, %177) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %178 = "arith.addi"(%177, %77) : (index, index) -> index | |
| "memref.store"(%172#5, %84, %178) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %179 = "arith.muli"(%82, %59) : (index, index) -> index | |
| %180 = "arith.addi"(%81, %179) : (index, index) -> index | |
| %181 = "arith.muli"(%83, %58) : (index, index) -> index | |
| %182 = "arith.addi"(%180, %181) : (index, index) -> index | |
| %183 = "arith.addi"(%182, %159) : (index, index) -> index | |
| "memref.store"(%172#6, %85, %183) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %184 = "arith.addi"(%183, %58) : (index, index) -> index | |
| "memref.store"(%172#7, %85, %184) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| "gpu.barrier"() : () -> () | |
| %185 = "arith.muli"(%82, %57) : (index, index) -> index | |
| %186 = "memref.load"(%84, %185) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %187 = "arith.addi"(%185, %56) : (index, index) -> index | |
| %188 = "memref.load"(%84, %187) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %189 = "arith.addi"(%185, %55) : (index, index) -> index | |
| %190 = "memref.load"(%84, %189) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %191 = "arith.addi"(%185, %54) : (index, index) -> index | |
| %192 = "memref.load"(%84, %191) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %193 = "arith.addi"(%185, %53) : (index, index) -> index | |
| %194 = "memref.load"(%84, %193) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %195 = "arith.addi"(%185, %52) : (index, index) -> index | |
| %196 = "memref.load"(%84, %195) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %197 = "arith.addi"(%185, %51) : (index, index) -> index | |
| %198 = "memref.load"(%84, %197) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %199 = "arith.addi"(%185, %50) : (index, index) -> index | |
| %200 = "memref.load"(%84, %199) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %201 = "arith.addi"(%185, %49) : (index, index) -> index | |
| %202 = "memref.load"(%84, %201) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %203 = "arith.addi"(%185, %48) : (index, index) -> index | |
| %204 = "memref.load"(%84, %203) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %205 = "arith.addi"(%185, %47) : (index, index) -> index | |
| %206 = "memref.load"(%84, %205) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %207 = "arith.addi"(%185, %46) : (index, index) -> index | |
| %208 = "memref.load"(%84, %207) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %209 = "arith.addi"(%185, %45) : (index, index) -> index | |
| %210 = "memref.load"(%84, %209) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %211 = "arith.addi"(%185, %44) : (index, index) -> index | |
| %212 = "memref.load"(%84, %211) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %213 = "arith.addi"(%185, %43) : (index, index) -> index | |
| %214 = "memref.load"(%84, %213) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %215 = "arith.addi"(%185, %71) : (index, index) -> index | |
| %216 = "memref.load"(%84, %215) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %217 = "arith.addi"(%185, %60) : (index, index) -> index | |
| %218 = "memref.load"(%84, %217) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %219 = "arith.addi"(%185, %42) : (index, index) -> index | |
| %220 = "memref.load"(%84, %219) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %221 = "arith.addi"(%185, %41) : (index, index) -> index | |
| %222 = "memref.load"(%84, %221) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %223 = "arith.addi"(%185, %40) : (index, index) -> index | |
| %224 = "memref.load"(%84, %223) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %225 = "arith.addi"(%185, %39) : (index, index) -> index | |
| %226 = "memref.load"(%84, %225) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %227 = "arith.addi"(%185, %38) : (index, index) -> index | |
| %228 = "memref.load"(%84, %227) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %229 = "arith.addi"(%185, %37) : (index, index) -> index | |
| %230 = "memref.load"(%84, %229) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %231 = "arith.addi"(%185, %36) : (index, index) -> index | |
| %232 = "memref.load"(%84, %231) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %233 = "arith.addi"(%185, %35) : (index, index) -> index | |
| %234 = "memref.load"(%84, %233) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %235 = "arith.addi"(%185, %34) : (index, index) -> index | |
| %236 = "memref.load"(%84, %235) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %237 = "arith.addi"(%185, %33) : (index, index) -> index | |
| %238 = "memref.load"(%84, %237) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %239 = "arith.addi"(%185, %32) : (index, index) -> index | |
| %240 = "memref.load"(%84, %239) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %241 = "arith.addi"(%185, %31) : (index, index) -> index | |
| %242 = "memref.load"(%84, %241) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %243 = "arith.addi"(%185, %80) : (index, index) -> index | |
| %244 = "memref.load"(%84, %243) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %245 = "arith.addi"(%185, %30) : (index, index) -> index | |
| %246 = "memref.load"(%84, %245) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %247 = "arith.addi"(%185, %29) : (index, index) -> index | |
| %248 = "memref.load"(%84, %247) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %249 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %250 = "arith.addi"(%81, %59) : (index, index) -> index | |
| %251 = "memref.load"(%85, %250) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %252 = "arith.addi"(%81, %29) : (index, index) -> index | |
| %253 = "memref.load"(%85, %252) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %254 = "arith.addi"(%81, %28) : (index, index) -> index | |
| %255 = "memref.load"(%85, %254) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %256 = "arith.addi"(%81, %27) : (index, index) -> index | |
| %257 = "memref.load"(%85, %256) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %258 = "arith.addi"(%81, %26) : (index, index) -> index | |
| %259 = "memref.load"(%85, %258) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %260 = "arith.addi"(%81, %25) : (index, index) -> index | |
| %261 = "memref.load"(%85, %260) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %262 = "arith.addi"(%81, %24) : (index, index) -> index | |
| %263 = "memref.load"(%85, %262) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %264 = "arith.addi"(%81, %23) : (index, index) -> index | |
| %265 = "memref.load"(%85, %264) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %266 = "arith.addi"(%81, %22) : (index, index) -> index | |
| %267 = "memref.load"(%85, %266) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %268 = "arith.addi"(%81, %21) : (index, index) -> index | |
| %269 = "memref.load"(%85, %268) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %270 = "arith.addi"(%81, %20) : (index, index) -> index | |
| %271 = "memref.load"(%85, %270) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %272 = "arith.addi"(%81, %19) : (index, index) -> index | |
| %273 = "memref.load"(%85, %272) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %274 = "arith.addi"(%81, %18) : (index, index) -> index | |
| %275 = "memref.load"(%85, %274) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %276 = "arith.addi"(%81, %17) : (index, index) -> index | |
| %277 = "memref.load"(%85, %276) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %278 = "arith.addi"(%81, %16) : (index, index) -> index | |
| %279 = "memref.load"(%85, %278) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %280 = "arith.addi"(%81, %58) : (index, index) -> index | |
| %281 = "memref.load"(%85, %280) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %282 = "arith.addi"(%81, %15) : (index, index) -> index | |
| %283 = "memref.load"(%85, %282) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %284 = "arith.addi"(%81, %14) : (index, index) -> index | |
| %285 = "memref.load"(%85, %284) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %286 = "arith.addi"(%81, %13) : (index, index) -> index | |
| %287 = "memref.load"(%85, %286) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %288 = "arith.addi"(%81, %12) : (index, index) -> index | |
| %289 = "memref.load"(%85, %288) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %290 = "arith.addi"(%81, %11) : (index, index) -> index | |
| %291 = "memref.load"(%85, %290) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %292 = "arith.addi"(%81, %10) : (index, index) -> index | |
| %293 = "memref.load"(%85, %292) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %294 = "arith.addi"(%81, %9) : (index, index) -> index | |
| %295 = "memref.load"(%85, %294) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %296 = "arith.addi"(%81, %8) : (index, index) -> index | |
| %297 = "memref.load"(%85, %296) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %298 = "arith.addi"(%81, %7) : (index, index) -> index | |
| %299 = "memref.load"(%85, %298) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %300 = "arith.addi"(%81, %6) : (index, index) -> index | |
| %301 = "memref.load"(%85, %300) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %302 = "arith.addi"(%81, %5) : (index, index) -> index | |
| %303 = "memref.load"(%85, %302) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %304 = "arith.addi"(%81, %4) : (index, index) -> index | |
| %305 = "memref.load"(%85, %304) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %306 = "arith.addi"(%81, %3) : (index, index) -> index | |
| %307 = "memref.load"(%85, %306) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %308 = "arith.addi"(%81, %2) : (index, index) -> index | |
| %309 = "memref.load"(%85, %308) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %310 = "arith.addi"(%81, %1) : (index, index) -> index | |
| %311 = "memref.load"(%85, %310) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %312 = "vector.extract"(%186) {position = [0]} : (vector<4xf32>) -> f32 | |
| %313 = "vector.splat"(%312) : (f32) -> vector<4xf32> | |
| %314 = "vector.fma"(%313, %249, %172#0) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %315 = "vector.extract"(%186) {position = [1]} : (vector<4xf32>) -> f32 | |
| %316 = "vector.splat"(%315) : (f32) -> vector<4xf32> | |
| %317 = "vector.fma"(%316, %251, %314) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %318 = "vector.extract"(%186) {position = [2]} : (vector<4xf32>) -> f32 | |
| %319 = "vector.splat"(%318) : (f32) -> vector<4xf32> | |
| %320 = "vector.fma"(%319, %253, %317) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %321 = "vector.extract"(%186) {position = [3]} : (vector<4xf32>) -> f32 | |
| %322 = "vector.splat"(%321) : (f32) -> vector<4xf32> | |
| %323 = "vector.fma"(%322, %255, %320) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %324 = "vector.extract"(%188) {position = [0]} : (vector<4xf32>) -> f32 | |
| %325 = "vector.splat"(%324) : (f32) -> vector<4xf32> | |
| %326 = "vector.fma"(%325, %257, %323) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %327 = "vector.extract"(%188) {position = [1]} : (vector<4xf32>) -> f32 | |
| %328 = "vector.splat"(%327) : (f32) -> vector<4xf32> | |
| %329 = "vector.fma"(%328, %259, %326) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %330 = "vector.extract"(%188) {position = [2]} : (vector<4xf32>) -> f32 | |
| %331 = "vector.splat"(%330) : (f32) -> vector<4xf32> | |
| %332 = "vector.fma"(%331, %261, %329) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %333 = "vector.extract"(%188) {position = [3]} : (vector<4xf32>) -> f32 | |
| %334 = "vector.splat"(%333) : (f32) -> vector<4xf32> | |
| %335 = "vector.fma"(%334, %263, %332) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %336 = "vector.extract"(%190) {position = [0]} : (vector<4xf32>) -> f32 | |
| %337 = "vector.splat"(%336) : (f32) -> vector<4xf32> | |
| %338 = "vector.fma"(%337, %265, %335) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %339 = "vector.extract"(%190) {position = [1]} : (vector<4xf32>) -> f32 | |
| %340 = "vector.splat"(%339) : (f32) -> vector<4xf32> | |
| %341 = "vector.fma"(%340, %267, %338) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %342 = "vector.extract"(%190) {position = [2]} : (vector<4xf32>) -> f32 | |
| %343 = "vector.splat"(%342) : (f32) -> vector<4xf32> | |
| %344 = "vector.fma"(%343, %269, %341) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %345 = "vector.extract"(%190) {position = [3]} : (vector<4xf32>) -> f32 | |
| %346 = "vector.splat"(%345) : (f32) -> vector<4xf32> | |
| %347 = "vector.fma"(%346, %271, %344) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %348 = "vector.extract"(%192) {position = [0]} : (vector<4xf32>) -> f32 | |
| %349 = "vector.splat"(%348) : (f32) -> vector<4xf32> | |
| %350 = "vector.fma"(%349, %273, %347) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %351 = "vector.extract"(%192) {position = [1]} : (vector<4xf32>) -> f32 | |
| %352 = "vector.splat"(%351) : (f32) -> vector<4xf32> | |
| %353 = "vector.fma"(%352, %275, %350) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %354 = "vector.extract"(%192) {position = [2]} : (vector<4xf32>) -> f32 | |
| %355 = "vector.splat"(%354) : (f32) -> vector<4xf32> | |
| %356 = "vector.fma"(%355, %277, %353) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %357 = "vector.extract"(%192) {position = [3]} : (vector<4xf32>) -> f32 | |
| %358 = "vector.splat"(%357) : (f32) -> vector<4xf32> | |
| %359 = "vector.fma"(%358, %279, %356) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %360 = "vector.extract"(%194) {position = [0]} : (vector<4xf32>) -> f32 | |
| %361 = "vector.splat"(%360) : (f32) -> vector<4xf32> | |
| %362 = "vector.fma"(%361, %281, %359) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %363 = "vector.extract"(%194) {position = [1]} : (vector<4xf32>) -> f32 | |
| %364 = "vector.splat"(%363) : (f32) -> vector<4xf32> | |
| %365 = "vector.fma"(%364, %283, %362) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %366 = "vector.extract"(%194) {position = [2]} : (vector<4xf32>) -> f32 | |
| %367 = "vector.splat"(%366) : (f32) -> vector<4xf32> | |
| %368 = "vector.fma"(%367, %285, %365) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %369 = "vector.extract"(%194) {position = [3]} : (vector<4xf32>) -> f32 | |
| %370 = "vector.splat"(%369) : (f32) -> vector<4xf32> | |
| %371 = "vector.fma"(%370, %287, %368) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %372 = "vector.extract"(%196) {position = [0]} : (vector<4xf32>) -> f32 | |
| %373 = "vector.splat"(%372) : (f32) -> vector<4xf32> | |
| %374 = "vector.fma"(%373, %289, %371) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %375 = "vector.extract"(%196) {position = [1]} : (vector<4xf32>) -> f32 | |
| %376 = "vector.splat"(%375) : (f32) -> vector<4xf32> | |
| %377 = "vector.fma"(%376, %291, %374) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %378 = "vector.extract"(%196) {position = [2]} : (vector<4xf32>) -> f32 | |
| %379 = "vector.splat"(%378) : (f32) -> vector<4xf32> | |
| %380 = "vector.fma"(%379, %293, %377) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %381 = "vector.extract"(%196) {position = [3]} : (vector<4xf32>) -> f32 | |
| %382 = "vector.splat"(%381) : (f32) -> vector<4xf32> | |
| %383 = "vector.fma"(%382, %295, %380) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %384 = "vector.extract"(%198) {position = [0]} : (vector<4xf32>) -> f32 | |
| %385 = "vector.splat"(%384) : (f32) -> vector<4xf32> | |
| %386 = "vector.fma"(%385, %297, %383) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %387 = "vector.extract"(%198) {position = [1]} : (vector<4xf32>) -> f32 | |
| %388 = "vector.splat"(%387) : (f32) -> vector<4xf32> | |
| %389 = "vector.fma"(%388, %299, %386) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %390 = "vector.extract"(%198) {position = [2]} : (vector<4xf32>) -> f32 | |
| %391 = "vector.splat"(%390) : (f32) -> vector<4xf32> | |
| %392 = "vector.fma"(%391, %301, %389) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %393 = "vector.extract"(%198) {position = [3]} : (vector<4xf32>) -> f32 | |
| %394 = "vector.splat"(%393) : (f32) -> vector<4xf32> | |
| %395 = "vector.fma"(%394, %303, %392) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %396 = "vector.extract"(%200) {position = [0]} : (vector<4xf32>) -> f32 | |
| %397 = "vector.splat"(%396) : (f32) -> vector<4xf32> | |
| %398 = "vector.fma"(%397, %305, %395) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %399 = "vector.extract"(%200) {position = [1]} : (vector<4xf32>) -> f32 | |
| %400 = "vector.splat"(%399) : (f32) -> vector<4xf32> | |
| %401 = "vector.fma"(%400, %307, %398) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %402 = "vector.extract"(%200) {position = [2]} : (vector<4xf32>) -> f32 | |
| %403 = "vector.splat"(%402) : (f32) -> vector<4xf32> | |
| %404 = "vector.fma"(%403, %309, %401) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %405 = "vector.extract"(%200) {position = [3]} : (vector<4xf32>) -> f32 | |
| %406 = "vector.splat"(%405) : (f32) -> vector<4xf32> | |
| %407 = "vector.fma"(%406, %311, %404) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %408 = "vector.extract"(%202) {position = [0]} : (vector<4xf32>) -> f32 | |
| %409 = "vector.splat"(%408) : (f32) -> vector<4xf32> | |
| %410 = "vector.fma"(%409, %249, %172#1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %411 = "vector.extract"(%202) {position = [1]} : (vector<4xf32>) -> f32 | |
| %412 = "vector.splat"(%411) : (f32) -> vector<4xf32> | |
| %413 = "vector.fma"(%412, %251, %410) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %414 = "vector.extract"(%202) {position = [2]} : (vector<4xf32>) -> f32 | |
| %415 = "vector.splat"(%414) : (f32) -> vector<4xf32> | |
| %416 = "vector.fma"(%415, %253, %413) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %417 = "vector.extract"(%202) {position = [3]} : (vector<4xf32>) -> f32 | |
| %418 = "vector.splat"(%417) : (f32) -> vector<4xf32> | |
| %419 = "vector.fma"(%418, %255, %416) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %420 = "vector.extract"(%204) {position = [0]} : (vector<4xf32>) -> f32 | |
| %421 = "vector.splat"(%420) : (f32) -> vector<4xf32> | |
| %422 = "vector.fma"(%421, %257, %419) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %423 = "vector.extract"(%204) {position = [1]} : (vector<4xf32>) -> f32 | |
| %424 = "vector.splat"(%423) : (f32) -> vector<4xf32> | |
| %425 = "vector.fma"(%424, %259, %422) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %426 = "vector.extract"(%204) {position = [2]} : (vector<4xf32>) -> f32 | |
| %427 = "vector.splat"(%426) : (f32) -> vector<4xf32> | |
| %428 = "vector.fma"(%427, %261, %425) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %429 = "vector.extract"(%204) {position = [3]} : (vector<4xf32>) -> f32 | |
| %430 = "vector.splat"(%429) : (f32) -> vector<4xf32> | |
| %431 = "vector.fma"(%430, %263, %428) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %432 = "vector.extract"(%206) {position = [0]} : (vector<4xf32>) -> f32 | |
| %433 = "vector.splat"(%432) : (f32) -> vector<4xf32> | |
| %434 = "vector.fma"(%433, %265, %431) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %435 = "vector.extract"(%206) {position = [1]} : (vector<4xf32>) -> f32 | |
| %436 = "vector.splat"(%435) : (f32) -> vector<4xf32> | |
| %437 = "vector.fma"(%436, %267, %434) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %438 = "vector.extract"(%206) {position = [2]} : (vector<4xf32>) -> f32 | |
| %439 = "vector.splat"(%438) : (f32) -> vector<4xf32> | |
| %440 = "vector.fma"(%439, %269, %437) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %441 = "vector.extract"(%206) {position = [3]} : (vector<4xf32>) -> f32 | |
| %442 = "vector.splat"(%441) : (f32) -> vector<4xf32> | |
| %443 = "vector.fma"(%442, %271, %440) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %444 = "vector.extract"(%208) {position = [0]} : (vector<4xf32>) -> f32 | |
| %445 = "vector.splat"(%444) : (f32) -> vector<4xf32> | |
| %446 = "vector.fma"(%445, %273, %443) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %447 = "vector.extract"(%208) {position = [1]} : (vector<4xf32>) -> f32 | |
| %448 = "vector.splat"(%447) : (f32) -> vector<4xf32> | |
| %449 = "vector.fma"(%448, %275, %446) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %450 = "vector.extract"(%208) {position = [2]} : (vector<4xf32>) -> f32 | |
| %451 = "vector.splat"(%450) : (f32) -> vector<4xf32> | |
| %452 = "vector.fma"(%451, %277, %449) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %453 = "vector.extract"(%208) {position = [3]} : (vector<4xf32>) -> f32 | |
| %454 = "vector.splat"(%453) : (f32) -> vector<4xf32> | |
| %455 = "vector.fma"(%454, %279, %452) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %456 = "vector.extract"(%210) {position = [0]} : (vector<4xf32>) -> f32 | |
| %457 = "vector.splat"(%456) : (f32) -> vector<4xf32> | |
| %458 = "vector.fma"(%457, %281, %455) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %459 = "vector.extract"(%210) {position = [1]} : (vector<4xf32>) -> f32 | |
| %460 = "vector.splat"(%459) : (f32) -> vector<4xf32> | |
| %461 = "vector.fma"(%460, %283, %458) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %462 = "vector.extract"(%210) {position = [2]} : (vector<4xf32>) -> f32 | |
| %463 = "vector.splat"(%462) : (f32) -> vector<4xf32> | |
| %464 = "vector.fma"(%463, %285, %461) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %465 = "vector.extract"(%210) {position = [3]} : (vector<4xf32>) -> f32 | |
| %466 = "vector.splat"(%465) : (f32) -> vector<4xf32> | |
| %467 = "vector.fma"(%466, %287, %464) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %468 = "vector.extract"(%212) {position = [0]} : (vector<4xf32>) -> f32 | |
| %469 = "vector.splat"(%468) : (f32) -> vector<4xf32> | |
| %470 = "vector.fma"(%469, %289, %467) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %471 = "vector.extract"(%212) {position = [1]} : (vector<4xf32>) -> f32 | |
| %472 = "vector.splat"(%471) : (f32) -> vector<4xf32> | |
| %473 = "vector.fma"(%472, %291, %470) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %474 = "vector.extract"(%212) {position = [2]} : (vector<4xf32>) -> f32 | |
| %475 = "vector.splat"(%474) : (f32) -> vector<4xf32> | |
| %476 = "vector.fma"(%475, %293, %473) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %477 = "vector.extract"(%212) {position = [3]} : (vector<4xf32>) -> f32 | |
| %478 = "vector.splat"(%477) : (f32) -> vector<4xf32> | |
| %479 = "vector.fma"(%478, %295, %476) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %480 = "vector.extract"(%214) {position = [0]} : (vector<4xf32>) -> f32 | |
| %481 = "vector.splat"(%480) : (f32) -> vector<4xf32> | |
| %482 = "vector.fma"(%481, %297, %479) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %483 = "vector.extract"(%214) {position = [1]} : (vector<4xf32>) -> f32 | |
| %484 = "vector.splat"(%483) : (f32) -> vector<4xf32> | |
| %485 = "vector.fma"(%484, %299, %482) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %486 = "vector.extract"(%214) {position = [2]} : (vector<4xf32>) -> f32 | |
| %487 = "vector.splat"(%486) : (f32) -> vector<4xf32> | |
| %488 = "vector.fma"(%487, %301, %485) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %489 = "vector.extract"(%214) {position = [3]} : (vector<4xf32>) -> f32 | |
| %490 = "vector.splat"(%489) : (f32) -> vector<4xf32> | |
| %491 = "vector.fma"(%490, %303, %488) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %492 = "vector.extract"(%216) {position = [0]} : (vector<4xf32>) -> f32 | |
| %493 = "vector.splat"(%492) : (f32) -> vector<4xf32> | |
| %494 = "vector.fma"(%493, %305, %491) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %495 = "vector.extract"(%216) {position = [1]} : (vector<4xf32>) -> f32 | |
| %496 = "vector.splat"(%495) : (f32) -> vector<4xf32> | |
| %497 = "vector.fma"(%496, %307, %494) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %498 = "vector.extract"(%216) {position = [2]} : (vector<4xf32>) -> f32 | |
| %499 = "vector.splat"(%498) : (f32) -> vector<4xf32> | |
| %500 = "vector.fma"(%499, %309, %497) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %501 = "vector.extract"(%216) {position = [3]} : (vector<4xf32>) -> f32 | |
| %502 = "vector.splat"(%501) : (f32) -> vector<4xf32> | |
| %503 = "vector.fma"(%502, %311, %500) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %504 = "vector.extract"(%218) {position = [0]} : (vector<4xf32>) -> f32 | |
| %505 = "vector.splat"(%504) : (f32) -> vector<4xf32> | |
| %506 = "vector.fma"(%505, %249, %172#2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %507 = "vector.extract"(%218) {position = [1]} : (vector<4xf32>) -> f32 | |
| %508 = "vector.splat"(%507) : (f32) -> vector<4xf32> | |
| %509 = "vector.fma"(%508, %251, %506) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %510 = "vector.extract"(%218) {position = [2]} : (vector<4xf32>) -> f32 | |
| %511 = "vector.splat"(%510) : (f32) -> vector<4xf32> | |
| %512 = "vector.fma"(%511, %253, %509) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %513 = "vector.extract"(%218) {position = [3]} : (vector<4xf32>) -> f32 | |
| %514 = "vector.splat"(%513) : (f32) -> vector<4xf32> | |
| %515 = "vector.fma"(%514, %255, %512) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %516 = "vector.extract"(%220) {position = [0]} : (vector<4xf32>) -> f32 | |
| %517 = "vector.splat"(%516) : (f32) -> vector<4xf32> | |
| %518 = "vector.fma"(%517, %257, %515) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %519 = "vector.extract"(%220) {position = [1]} : (vector<4xf32>) -> f32 | |
| %520 = "vector.splat"(%519) : (f32) -> vector<4xf32> | |
| %521 = "vector.fma"(%520, %259, %518) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %522 = "vector.extract"(%220) {position = [2]} : (vector<4xf32>) -> f32 | |
| %523 = "vector.splat"(%522) : (f32) -> vector<4xf32> | |
| %524 = "vector.fma"(%523, %261, %521) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %525 = "vector.extract"(%220) {position = [3]} : (vector<4xf32>) -> f32 | |
| %526 = "vector.splat"(%525) : (f32) -> vector<4xf32> | |
| %527 = "vector.fma"(%526, %263, %524) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %528 = "vector.extract"(%222) {position = [0]} : (vector<4xf32>) -> f32 | |
| %529 = "vector.splat"(%528) : (f32) -> vector<4xf32> | |
| %530 = "vector.fma"(%529, %265, %527) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %531 = "vector.extract"(%222) {position = [1]} : (vector<4xf32>) -> f32 | |
| %532 = "vector.splat"(%531) : (f32) -> vector<4xf32> | |
| %533 = "vector.fma"(%532, %267, %530) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %534 = "vector.extract"(%222) {position = [2]} : (vector<4xf32>) -> f32 | |
| %535 = "vector.splat"(%534) : (f32) -> vector<4xf32> | |
| %536 = "vector.fma"(%535, %269, %533) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %537 = "vector.extract"(%222) {position = [3]} : (vector<4xf32>) -> f32 | |
| %538 = "vector.splat"(%537) : (f32) -> vector<4xf32> | |
| %539 = "vector.fma"(%538, %271, %536) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %540 = "vector.extract"(%224) {position = [0]} : (vector<4xf32>) -> f32 | |
| %541 = "vector.splat"(%540) : (f32) -> vector<4xf32> | |
| %542 = "vector.fma"(%541, %273, %539) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %543 = "vector.extract"(%224) {position = [1]} : (vector<4xf32>) -> f32 | |
| %544 = "vector.splat"(%543) : (f32) -> vector<4xf32> | |
| %545 = "vector.fma"(%544, %275, %542) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %546 = "vector.extract"(%224) {position = [2]} : (vector<4xf32>) -> f32 | |
| %547 = "vector.splat"(%546) : (f32) -> vector<4xf32> | |
| %548 = "vector.fma"(%547, %277, %545) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %549 = "vector.extract"(%224) {position = [3]} : (vector<4xf32>) -> f32 | |
| %550 = "vector.splat"(%549) : (f32) -> vector<4xf32> | |
| %551 = "vector.fma"(%550, %279, %548) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %552 = "vector.extract"(%226) {position = [0]} : (vector<4xf32>) -> f32 | |
| %553 = "vector.splat"(%552) : (f32) -> vector<4xf32> | |
| %554 = "vector.fma"(%553, %281, %551) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %555 = "vector.extract"(%226) {position = [1]} : (vector<4xf32>) -> f32 | |
| %556 = "vector.splat"(%555) : (f32) -> vector<4xf32> | |
| %557 = "vector.fma"(%556, %283, %554) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %558 = "vector.extract"(%226) {position = [2]} : (vector<4xf32>) -> f32 | |
| %559 = "vector.splat"(%558) : (f32) -> vector<4xf32> | |
| %560 = "vector.fma"(%559, %285, %557) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %561 = "vector.extract"(%226) {position = [3]} : (vector<4xf32>) -> f32 | |
| %562 = "vector.splat"(%561) : (f32) -> vector<4xf32> | |
| %563 = "vector.fma"(%562, %287, %560) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %564 = "vector.extract"(%228) {position = [0]} : (vector<4xf32>) -> f32 | |
| %565 = "vector.splat"(%564) : (f32) -> vector<4xf32> | |
| %566 = "vector.fma"(%565, %289, %563) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %567 = "vector.extract"(%228) {position = [1]} : (vector<4xf32>) -> f32 | |
| %568 = "vector.splat"(%567) : (f32) -> vector<4xf32> | |
| %569 = "vector.fma"(%568, %291, %566) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %570 = "vector.extract"(%228) {position = [2]} : (vector<4xf32>) -> f32 | |
| %571 = "vector.splat"(%570) : (f32) -> vector<4xf32> | |
| %572 = "vector.fma"(%571, %293, %569) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %573 = "vector.extract"(%228) {position = [3]} : (vector<4xf32>) -> f32 | |
| %574 = "vector.splat"(%573) : (f32) -> vector<4xf32> | |
| %575 = "vector.fma"(%574, %295, %572) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %576 = "vector.extract"(%230) {position = [0]} : (vector<4xf32>) -> f32 | |
| %577 = "vector.splat"(%576) : (f32) -> vector<4xf32> | |
| %578 = "vector.fma"(%577, %297, %575) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %579 = "vector.extract"(%230) {position = [1]} : (vector<4xf32>) -> f32 | |
| %580 = "vector.splat"(%579) : (f32) -> vector<4xf32> | |
| %581 = "vector.fma"(%580, %299, %578) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %582 = "vector.extract"(%230) {position = [2]} : (vector<4xf32>) -> f32 | |
| %583 = "vector.splat"(%582) : (f32) -> vector<4xf32> | |
| %584 = "vector.fma"(%583, %301, %581) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %585 = "vector.extract"(%230) {position = [3]} : (vector<4xf32>) -> f32 | |
| %586 = "vector.splat"(%585) : (f32) -> vector<4xf32> | |
| %587 = "vector.fma"(%586, %303, %584) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %588 = "vector.extract"(%232) {position = [0]} : (vector<4xf32>) -> f32 | |
| %589 = "vector.splat"(%588) : (f32) -> vector<4xf32> | |
| %590 = "vector.fma"(%589, %305, %587) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %591 = "vector.extract"(%232) {position = [1]} : (vector<4xf32>) -> f32 | |
| %592 = "vector.splat"(%591) : (f32) -> vector<4xf32> | |
| %593 = "vector.fma"(%592, %307, %590) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %594 = "vector.extract"(%232) {position = [2]} : (vector<4xf32>) -> f32 | |
| %595 = "vector.splat"(%594) : (f32) -> vector<4xf32> | |
| %596 = "vector.fma"(%595, %309, %593) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %597 = "vector.extract"(%232) {position = [3]} : (vector<4xf32>) -> f32 | |
| %598 = "vector.splat"(%597) : (f32) -> vector<4xf32> | |
| %599 = "vector.fma"(%598, %311, %596) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %600 = "vector.extract"(%234) {position = [0]} : (vector<4xf32>) -> f32 | |
| %601 = "vector.splat"(%600) : (f32) -> vector<4xf32> | |
| %602 = "vector.fma"(%601, %249, %172#3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %603 = "vector.extract"(%234) {position = [1]} : (vector<4xf32>) -> f32 | |
| %604 = "vector.splat"(%603) : (f32) -> vector<4xf32> | |
| %605 = "vector.fma"(%604, %251, %602) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %606 = "vector.extract"(%234) {position = [2]} : (vector<4xf32>) -> f32 | |
| %607 = "vector.splat"(%606) : (f32) -> vector<4xf32> | |
| %608 = "vector.fma"(%607, %253, %605) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %609 = "vector.extract"(%234) {position = [3]} : (vector<4xf32>) -> f32 | |
| %610 = "vector.splat"(%609) : (f32) -> vector<4xf32> | |
| %611 = "vector.fma"(%610, %255, %608) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %612 = "vector.extract"(%236) {position = [0]} : (vector<4xf32>) -> f32 | |
| %613 = "vector.splat"(%612) : (f32) -> vector<4xf32> | |
| %614 = "vector.fma"(%613, %257, %611) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %615 = "vector.extract"(%236) {position = [1]} : (vector<4xf32>) -> f32 | |
| %616 = "vector.splat"(%615) : (f32) -> vector<4xf32> | |
| %617 = "vector.fma"(%616, %259, %614) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %618 = "vector.extract"(%236) {position = [2]} : (vector<4xf32>) -> f32 | |
| %619 = "vector.splat"(%618) : (f32) -> vector<4xf32> | |
| %620 = "vector.fma"(%619, %261, %617) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %621 = "vector.extract"(%236) {position = [3]} : (vector<4xf32>) -> f32 | |
| %622 = "vector.splat"(%621) : (f32) -> vector<4xf32> | |
| %623 = "vector.fma"(%622, %263, %620) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %624 = "vector.extract"(%238) {position = [0]} : (vector<4xf32>) -> f32 | |
| %625 = "vector.splat"(%624) : (f32) -> vector<4xf32> | |
| %626 = "vector.fma"(%625, %265, %623) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %627 = "vector.extract"(%238) {position = [1]} : (vector<4xf32>) -> f32 | |
| %628 = "vector.splat"(%627) : (f32) -> vector<4xf32> | |
| %629 = "vector.fma"(%628, %267, %626) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %630 = "vector.extract"(%238) {position = [2]} : (vector<4xf32>) -> f32 | |
| %631 = "vector.splat"(%630) : (f32) -> vector<4xf32> | |
| %632 = "vector.fma"(%631, %269, %629) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %633 = "vector.extract"(%238) {position = [3]} : (vector<4xf32>) -> f32 | |
| %634 = "vector.splat"(%633) : (f32) -> vector<4xf32> | |
| %635 = "vector.fma"(%634, %271, %632) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %636 = "vector.extract"(%240) {position = [0]} : (vector<4xf32>) -> f32 | |
| %637 = "vector.splat"(%636) : (f32) -> vector<4xf32> | |
| %638 = "vector.fma"(%637, %273, %635) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %639 = "vector.extract"(%240) {position = [1]} : (vector<4xf32>) -> f32 | |
| %640 = "vector.splat"(%639) : (f32) -> vector<4xf32> | |
| %641 = "vector.fma"(%640, %275, %638) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %642 = "vector.extract"(%240) {position = [2]} : (vector<4xf32>) -> f32 | |
| %643 = "vector.splat"(%642) : (f32) -> vector<4xf32> | |
| %644 = "vector.fma"(%643, %277, %641) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %645 = "vector.extract"(%240) {position = [3]} : (vector<4xf32>) -> f32 | |
| %646 = "vector.splat"(%645) : (f32) -> vector<4xf32> | |
| %647 = "vector.fma"(%646, %279, %644) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %648 = "vector.extract"(%242) {position = [0]} : (vector<4xf32>) -> f32 | |
| %649 = "vector.splat"(%648) : (f32) -> vector<4xf32> | |
| %650 = "vector.fma"(%649, %281, %647) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %651 = "vector.extract"(%242) {position = [1]} : (vector<4xf32>) -> f32 | |
| %652 = "vector.splat"(%651) : (f32) -> vector<4xf32> | |
| %653 = "vector.fma"(%652, %283, %650) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %654 = "vector.extract"(%242) {position = [2]} : (vector<4xf32>) -> f32 | |
| %655 = "vector.splat"(%654) : (f32) -> vector<4xf32> | |
| %656 = "vector.fma"(%655, %285, %653) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %657 = "vector.extract"(%242) {position = [3]} : (vector<4xf32>) -> f32 | |
| %658 = "vector.splat"(%657) : (f32) -> vector<4xf32> | |
| %659 = "vector.fma"(%658, %287, %656) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %660 = "vector.extract"(%244) {position = [0]} : (vector<4xf32>) -> f32 | |
| %661 = "vector.splat"(%660) : (f32) -> vector<4xf32> | |
| %662 = "vector.fma"(%661, %289, %659) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %663 = "vector.extract"(%244) {position = [1]} : (vector<4xf32>) -> f32 | |
| %664 = "vector.splat"(%663) : (f32) -> vector<4xf32> | |
| %665 = "vector.fma"(%664, %291, %662) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %666 = "vector.extract"(%244) {position = [2]} : (vector<4xf32>) -> f32 | |
| %667 = "vector.splat"(%666) : (f32) -> vector<4xf32> | |
| %668 = "vector.fma"(%667, %293, %665) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %669 = "vector.extract"(%244) {position = [3]} : (vector<4xf32>) -> f32 | |
| %670 = "vector.splat"(%669) : (f32) -> vector<4xf32> | |
| %671 = "vector.fma"(%670, %295, %668) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %672 = "vector.extract"(%246) {position = [0]} : (vector<4xf32>) -> f32 | |
| %673 = "vector.splat"(%672) : (f32) -> vector<4xf32> | |
| %674 = "vector.fma"(%673, %297, %671) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %675 = "vector.extract"(%246) {position = [1]} : (vector<4xf32>) -> f32 | |
| %676 = "vector.splat"(%675) : (f32) -> vector<4xf32> | |
| %677 = "vector.fma"(%676, %299, %674) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %678 = "vector.extract"(%246) {position = [2]} : (vector<4xf32>) -> f32 | |
| %679 = "vector.splat"(%678) : (f32) -> vector<4xf32> | |
| %680 = "vector.fma"(%679, %301, %677) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %681 = "vector.extract"(%246) {position = [3]} : (vector<4xf32>) -> f32 | |
| %682 = "vector.splat"(%681) : (f32) -> vector<4xf32> | |
| %683 = "vector.fma"(%682, %303, %680) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %684 = "vector.extract"(%248) {position = [0]} : (vector<4xf32>) -> f32 | |
| %685 = "vector.splat"(%684) : (f32) -> vector<4xf32> | |
| %686 = "vector.fma"(%685, %305, %683) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %687 = "vector.extract"(%248) {position = [1]} : (vector<4xf32>) -> f32 | |
| %688 = "vector.splat"(%687) : (f32) -> vector<4xf32> | |
| %689 = "vector.fma"(%688, %307, %686) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %690 = "vector.extract"(%248) {position = [2]} : (vector<4xf32>) -> f32 | |
| %691 = "vector.splat"(%690) : (f32) -> vector<4xf32> | |
| %692 = "vector.fma"(%691, %309, %689) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %693 = "vector.extract"(%248) {position = [3]} : (vector<4xf32>) -> f32 | |
| %694 = "vector.splat"(%693) : (f32) -> vector<4xf32> | |
| %695 = "vector.fma"(%694, %311, %692) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "memref.store"(%695, %109, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%599, %110, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%503, %111, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%407, %112, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 16, 1]>, sym_name = "forward_dispatch_35_matmul_18432x320x320"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| %133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:1320:12: error: failed to serialize executables | |
| %133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
| ^ | |
| /home/prashant/stable.mlir:24:3: note: called from | |
| func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> { | |
| ^ | |
| /home/prashant/stable.mlir:1320:12: note: see current operation: | |
| "hal.executable"() ({ | |
| "hal.executable.variant"() ({ | |
| "hal.executable.export"() ({ | |
| ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): | |
| %0 = "arith.constant"() {value = 5 : index} : () -> index | |
| %1 = "arith.constant"() {value = 288 : index} : () -> index | |
| %2 = "arith.constant"() {value = 1 : index} : () -> index | |
| "hal.return"(%0, %1, %2) : (index, index, index) -> () | |
| }) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_35_matmul_18432x320x320", translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>, workgroup_size = [16 : index, 16 : index, 1 : index]} : () -> () | |
| "builtin.module"() ({ | |
| "spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "spirv.GlobalVariable"() {binding = 2 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_2_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<vector<4xf32>, stride=16> [0])>, StorageBuffer>} : () -> () | |
| "func.func"() ({ | |
| %0 = "arith.constant"() {value = -33 : index} : () -> index | |
| %1 = "arith.constant"() {value = 527 : index} : () -> index | |
| %2 = "arith.constant"() {value = 510 : index} : () -> index | |
| %3 = "arith.constant"() {value = 493 : index} : () -> index | |
| %4 = "arith.constant"() {value = 476 : index} : () -> index | |
| %5 = "arith.constant"() {value = 459 : index} : () -> index | |
| %6 = "arith.constant"() {value = 442 : index} : () -> index | |
| %7 = "arith.constant"() {value = 425 : index} : () -> index | |
| %8 = "arith.constant"() {value = 408 : index} : () -> index | |
| %9 = "arith.constant"() {value = 391 : index} : () -> index | |
| %10 = "arith.constant"() {value = 374 : index} : () -> index | |
| %11 = "arith.constant"() {value = 357 : index} : () -> index | |
| %12 = "arith.constant"() {value = 340 : index} : () -> index | |
| %13 = "arith.constant"() {value = 323 : index} : () -> index | |
| %14 = "arith.constant"() {value = 306 : index} : () -> index | |
| %15 = "arith.constant"() {value = 289 : index} : () -> index | |
| %16 = "arith.constant"() {value = 255 : index} : () -> index | |
| %17 = "arith.constant"() {value = 238 : index} : () -> index | |
| %18 = "arith.constant"() {value = 221 : index} : () -> index | |
| %19 = "arith.constant"() {value = 204 : index} : () -> index | |
| %20 = "arith.constant"() {value = 187 : index} : () -> index | |
| %21 = "arith.constant"() {value = 170 : index} : () -> index | |
| %22 = "arith.constant"() {value = 153 : index} : () -> index | |
| %23 = "arith.constant"() {value = 136 : index} : () -> index | |
| %24 = "arith.constant"() {value = 119 : index} : () -> index | |
| %25 = "arith.constant"() {value = 102 : index} : () -> index | |
| %26 = "arith.constant"() {value = 85 : index} : () -> index | |
| %27 = "arith.constant"() {value = 68 : index} : () -> index | |
| %28 = "arith.constant"() {value = 51 : index} : () -> index | |
| %29 = "arith.constant"() {value = 34 : index} : () -> index | |
| %30 = "arith.constant"() {value = 33 : index} : () -> index | |
| %31 = "arith.constant"() {value = 31 : index} : () -> index | |
| %32 = "arith.constant"() {value = 30 : index} : () -> index | |
| %33 = "arith.constant"() {value = 29 : index} : () -> index | |
| %34 = "arith.constant"() {value = 28 : index} : () -> index | |
| %35 = "arith.constant"() {value = 27 : index} : () -> index | |
| %36 = "arith.constant"() {value = 25 : index} : () -> index | |
| %37 = "arith.constant"() {value = 24 : index} : () -> index | |
| %38 = "arith.constant"() {value = 23 : index} : () -> index | |
| %39 = "arith.constant"() {value = 22 : index} : () -> index | |
| %40 = "arith.constant"() {value = 21 : index} : () -> index | |
| %41 = "arith.constant"() {value = 20 : index} : () -> index | |
| %42 = "arith.constant"() {value = 19 : index} : () -> index | |
| %43 = "arith.constant"() {value = 15 : index} : () -> index | |
| %44 = "arith.constant"() {value = 14 : index} : () -> index | |
| %45 = "arith.constant"() {value = 13 : index} : () -> index | |
| %46 = "arith.constant"() {value = 12 : index} : () -> index | |
| %47 = "arith.constant"() {value = 11 : index} : () -> index | |
| %48 = "arith.constant"() {value = 10 : index} : () -> index | |
| %49 = "arith.constant"() {value = 9 : index} : () -> index | |
| %50 = "arith.constant"() {value = 7 : index} : () -> index | |
| %51 = "arith.constant"() {value = 6 : index} : () -> index | |
| %52 = "arith.constant"() {value = 5 : index} : () -> index | |
| %53 = "arith.constant"() {value = 4 : index} : () -> index | |
| %54 = "arith.constant"() {value = 3 : index} : () -> index | |
| %55 = "arith.constant"() {value = 2 : index} : () -> index | |
| %56 = "arith.constant"() {value = 1 : index} : () -> index | |
| %57 = "arith.constant"() {value = 36 : index} : () -> index | |
| %58 = "arith.constant"() {value = 272 : index} : () -> index | |
| %59 = "arith.constant"() {value = 17 : index} : () -> index | |
| %60 = "arith.constant"() {value = 18 : index} : () -> index | |
| %61 = "arith.constant"() {value = 64 : index} : () -> index | |
| %62 = "arith.constant"() {value = 1280 : index} : () -> index | |
| %63 = "arith.constant"() {value = 1477120 : index} : () -> index | |
| %64 = "arith.constant"() {value = 72 : index} : () -> index | |
| %65 = "arith.constant"() {value = 8 : index} : () -> index | |
| %66 = "arith.constant"() {value = 2560 : index} : () -> index | |
| %67 = "arith.constant"() {value = 240 : index} : () -> index | |
| %68 = "arith.constant"() {value = 160 : index} : () -> index | |
| %69 = "arith.constant"() {value = 80 : index} : () -> index | |
| %70 = "arith.constant"() {value = -1 : index} : () -> index | |
| %71 = "arith.constant"() {value = 16 : index} : () -> index | |
| %72 = "arith.constant"() {value = 320 : index} : () -> index | |
| %73 = "arith.constant"() {value = 5120 : index} : () -> index | |
| %74 = "arith.constant"() {value = 0 : index} : () -> index | |
| %75 = "arith.constant"() {value = 1474560 : index} : () -> index | |
| %76 = "arith.constant"() {value = 25600 : index} : () -> index | |
| %77 = "arith.constant"() {value = 288 : index} : () -> index | |
| %78 = "arith.constant"() {value = dense<0.000000e+00> : vector<4xf32>} : () -> vector<4xf32> | |
| %79 = "arith.constant"() {value = 23592960 : index} : () -> index | |
| %80 = "arith.constant"() {value = 32 : index} : () -> index | |
| %81 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
| %82 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
| %83 = "gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index | |
| %84 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
| %85 = "memref.alloc"() {operand_segment_sizes = array<i32: 0, 0>} : () -> memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>> | |
| %86 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
| %87 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
| %88 = "arith.index_castui"(%86) : (i32) -> index | |
| %89 = "arith.index_castui"(%87) : (i32) -> index | |
| %90 = "hal.interface.binding.subspan"(%79, %75) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %91 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %92 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %93 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %94 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %95 = "hal.interface.binding.subspan"(%88, %76) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %96 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %97 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %98 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %99 = "hal.interface.binding.subspan"(%74, %76) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %100 = "hal.interface.binding.subspan"(%89, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %101 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %102 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %103 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %104 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %105 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %106 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %107 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %108 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %109 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %110 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %111 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %112 = "hal.interface.binding.subspan"(%74, %75) {alignment = 64 : index, binding = 2 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>> | |
| %113 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
| %114 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
| %115 = "arith.muli"(%114, %73) : (index, index) -> index | |
| %116 = "arith.muli"(%82, %72) : (index, index) -> index | |
| %117 = "arith.addi"(%115, %116) : (index, index) -> index | |
| %118 = "arith.muli"(%113, %71) : (index, index) -> index | |
| %119 = "arith.addi"(%117, %118) : (index, index) -> index | |
| %120 = "arith.addi"(%119, %81) : (index, index) -> index | |
| %121 = "arith.cmpi"(%89, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %122 = "arith.subi"(%70, %89) : (index, index) -> index | |
| %123 = "arith.select"(%121, %122, %89) : (i1, index, index) -> index | |
| %124 = "arith.divsi"(%123, %71) : (index, index) -> index | |
| %125 = "arith.subi"(%70, %124) : (index, index) -> index | |
| %126 = "arith.select"(%121, %125, %124) : (i1, index, index) -> index | |
| %127 = "arith.addi"(%120, %126) : (index, index) -> index | |
| "memref.store"(%78, %101, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %128 = "arith.addi"(%127, %69) : (index, index) -> index | |
| "memref.store"(%78, %102, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %129 = "arith.addi"(%127, %68) : (index, index) -> index | |
| "memref.store"(%78, %103, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %130 = "arith.addi"(%127, %67) : (index, index) -> index | |
| "memref.store"(%78, %104, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| %131 = "memref.load"(%105, %127) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %132 = "memref.load"(%106, %128) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %133 = "memref.load"(%107, %129) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %134 = "memref.load"(%108, %130) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %135 = "arith.addi"(%115, %81) : (index, index) -> index | |
| %136 = "arith.muli"(%82, %68) : (index, index) -> index | |
| %137 = "arith.addi"(%135, %136) : (index, index) -> index | |
| %138 = "arith.muli"(%83, %66) : (index, index) -> index | |
| %139 = "arith.addi"(%137, %138) : (index, index) -> index | |
| %140 = "arith.cmpi"(%81, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %141 = "arith.subi"(%70, %81) : (index, index) -> index | |
| %142 = "arith.select"(%140, %141, %81) : (i1, index, index) -> index | |
| %143 = "arith.divsi"(%142, %65) : (index, index) -> index | |
| %144 = "arith.subi"(%70, %143) : (index, index) -> index | |
| %145 = "arith.select"(%140, %144, %143) : (i1, index, index) -> index | |
| %146 = "arith.muli"(%145, %64) : (index, index) -> index | |
| %147 = "arith.addi"(%139, %146) : (index, index) -> index | |
| %148 = "arith.addi"(%147, %75) : (index, index) -> index | |
| %149 = "memref.load"(%91, %148) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %150 = "arith.addi"(%147, %63) : (index, index) -> index | |
| %151 = "memref.load"(%92, %150) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %152 = "arith.muli"(%82, %69) : (index, index) -> index | |
| %153 = "arith.addi"(%81, %152) : (index, index) -> index | |
| %154 = "arith.muli"(%83, %62) : (index, index) -> index | |
| %155 = "arith.addi"(%153, %154) : (index, index) -> index | |
| %156 = "arith.addi"(%155, %118) : (index, index) -> index | |
| %157 = "arith.divsi"(%142, %71) : (index, index) -> index | |
| %158 = "arith.subi"(%70, %157) : (index, index) -> index | |
| %159 = "arith.select"(%140, %158, %157) : (i1, index, index) -> index | |
| %160 = "arith.muli"(%159, %61) : (index, index) -> index | |
| %161 = "arith.addi"(%156, %160) : (index, index) -> index | |
| %162 = "arith.cmpi"(%88, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %163 = "arith.subi"(%70, %88) : (index, index) -> index | |
| %164 = "arith.select"(%162, %163, %88) : (i1, index, index) -> index | |
| %165 = "arith.divsi"(%164, %71) : (index, index) -> index | |
| %166 = "arith.subi"(%70, %165) : (index, index) -> index | |
| %167 = "arith.select"(%162, %166, %165) : (i1, index, index) -> index | |
| %168 = "arith.addi"(%161, %167) : (index, index) -> index | |
| %169 = "memref.load"(%96, %168) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %170 = "arith.addi"(%168, %62) : (index, index) -> index | |
| %171 = "memref.load"(%97, %170) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %172:8 = "scf.for"(%74, %77, %80, %131, %132, %133, %134, %149, %151, %169, %171) ({ | |
| ^bb0(%arg0: index, %arg1: vector<4xf32>, %arg2: vector<4xf32>, %arg3: vector<4xf32>, %arg4: vector<4xf32>, %arg5: vector<4xf32>, %arg6: vector<4xf32>, %arg7: vector<4xf32>, %arg8: vector<4xf32>): | |
| "gpu.barrier"() : () -> () | |
| %696 = "arith.muli"(%82, %60) : (index, index) -> index | |
| %697 = "arith.addi"(%81, %696) : (index, index) -> index | |
| %698 = "arith.muli"(%83, %77) : (index, index) -> index | |
| %699 = "arith.addi"(%697, %698) : (index, index) -> index | |
| %700 = "arith.addi"(%699, %145) : (index, index) -> index | |
| "memref.store"(%arg5, %84, %700) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %701 = "arith.addi"(%700, %77) : (index, index) -> index | |
| "memref.store"(%arg6, %84, %701) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %702 = "arith.muli"(%82, %59) : (index, index) -> index | |
| %703 = "arith.addi"(%81, %702) : (index, index) -> index | |
| %704 = "arith.muli"(%83, %58) : (index, index) -> index | |
| %705 = "arith.addi"(%703, %704) : (index, index) -> index | |
| %706 = "arith.addi"(%705, %159) : (index, index) -> index | |
| "memref.store"(%arg7, %85, %706) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %707 = "arith.addi"(%706, %58) : (index, index) -> index | |
| "memref.store"(%arg8, %85, %707) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| "gpu.barrier"() : () -> () | |
| %708 = "arith.muli"(%82, %57) : (index, index) -> index | |
| %709 = "memref.load"(%84, %708) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %710 = "arith.addi"(%708, %56) : (index, index) -> index | |
| %711 = "memref.load"(%84, %710) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %712 = "arith.addi"(%708, %55) : (index, index) -> index | |
| %713 = "memref.load"(%84, %712) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %714 = "arith.addi"(%708, %54) : (index, index) -> index | |
| %715 = "memref.load"(%84, %714) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %716 = "arith.addi"(%708, %53) : (index, index) -> index | |
| %717 = "memref.load"(%84, %716) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %718 = "arith.addi"(%708, %52) : (index, index) -> index | |
| %719 = "memref.load"(%84, %718) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %720 = "arith.addi"(%708, %51) : (index, index) -> index | |
| %721 = "memref.load"(%84, %720) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %722 = "arith.addi"(%708, %50) : (index, index) -> index | |
| %723 = "memref.load"(%84, %722) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %724 = "arith.addi"(%708, %49) : (index, index) -> index | |
| %725 = "memref.load"(%84, %724) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %726 = "arith.addi"(%708, %48) : (index, index) -> index | |
| %727 = "memref.load"(%84, %726) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %728 = "arith.addi"(%708, %47) : (index, index) -> index | |
| %729 = "memref.load"(%84, %728) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %730 = "arith.addi"(%708, %46) : (index, index) -> index | |
| %731 = "memref.load"(%84, %730) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %732 = "arith.addi"(%708, %45) : (index, index) -> index | |
| %733 = "memref.load"(%84, %732) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %734 = "arith.addi"(%708, %44) : (index, index) -> index | |
| %735 = "memref.load"(%84, %734) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %736 = "arith.addi"(%708, %43) : (index, index) -> index | |
| %737 = "memref.load"(%84, %736) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %738 = "arith.addi"(%708, %71) : (index, index) -> index | |
| %739 = "memref.load"(%84, %738) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %740 = "arith.addi"(%708, %60) : (index, index) -> index | |
| %741 = "memref.load"(%84, %740) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %742 = "arith.addi"(%708, %42) : (index, index) -> index | |
| %743 = "memref.load"(%84, %742) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %744 = "arith.addi"(%708, %41) : (index, index) -> index | |
| %745 = "memref.load"(%84, %744) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %746 = "arith.addi"(%708, %40) : (index, index) -> index | |
| %747 = "memref.load"(%84, %746) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %748 = "arith.addi"(%708, %39) : (index, index) -> index | |
| %749 = "memref.load"(%84, %748) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %750 = "arith.addi"(%708, %38) : (index, index) -> index | |
| %751 = "memref.load"(%84, %750) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %752 = "arith.addi"(%708, %37) : (index, index) -> index | |
| %753 = "memref.load"(%84, %752) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %754 = "arith.addi"(%708, %36) : (index, index) -> index | |
| %755 = "memref.load"(%84, %754) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %756 = "arith.addi"(%708, %35) : (index, index) -> index | |
| %757 = "memref.load"(%84, %756) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %758 = "arith.addi"(%708, %34) : (index, index) -> index | |
| %759 = "memref.load"(%84, %758) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %760 = "arith.addi"(%708, %33) : (index, index) -> index | |
| %761 = "memref.load"(%84, %760) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %762 = "arith.addi"(%708, %32) : (index, index) -> index | |
| %763 = "memref.load"(%84, %762) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %764 = "arith.addi"(%708, %31) : (index, index) -> index | |
| %765 = "memref.load"(%84, %764) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %766 = "arith.addi"(%708, %80) : (index, index) -> index | |
| %767 = "memref.load"(%84, %766) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %768 = "arith.addi"(%708, %30) : (index, index) -> index | |
| %769 = "memref.load"(%84, %768) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %770 = "arith.addi"(%708, %29) : (index, index) -> index | |
| %771 = "memref.load"(%84, %770) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %772 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %773 = "arith.addi"(%81, %59) : (index, index) -> index | |
| %774 = "memref.load"(%85, %773) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %775 = "arith.addi"(%81, %29) : (index, index) -> index | |
| %776 = "memref.load"(%85, %775) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %777 = "arith.addi"(%81, %28) : (index, index) -> index | |
| %778 = "memref.load"(%85, %777) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %779 = "arith.addi"(%81, %27) : (index, index) -> index | |
| %780 = "memref.load"(%85, %779) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %781 = "arith.addi"(%81, %26) : (index, index) -> index | |
| %782 = "memref.load"(%85, %781) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %783 = "arith.addi"(%81, %25) : (index, index) -> index | |
| %784 = "memref.load"(%85, %783) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %785 = "arith.addi"(%81, %24) : (index, index) -> index | |
| %786 = "memref.load"(%85, %785) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %787 = "arith.addi"(%81, %23) : (index, index) -> index | |
| %788 = "memref.load"(%85, %787) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %789 = "arith.addi"(%81, %22) : (index, index) -> index | |
| %790 = "memref.load"(%85, %789) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %791 = "arith.addi"(%81, %21) : (index, index) -> index | |
| %792 = "memref.load"(%85, %791) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %793 = "arith.addi"(%81, %20) : (index, index) -> index | |
| %794 = "memref.load"(%85, %793) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %795 = "arith.addi"(%81, %19) : (index, index) -> index | |
| %796 = "memref.load"(%85, %795) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %797 = "arith.addi"(%81, %18) : (index, index) -> index | |
| %798 = "memref.load"(%85, %797) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %799 = "arith.addi"(%81, %17) : (index, index) -> index | |
| %800 = "memref.load"(%85, %799) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %801 = "arith.addi"(%81, %16) : (index, index) -> index | |
| %802 = "memref.load"(%85, %801) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %803 = "arith.addi"(%81, %58) : (index, index) -> index | |
| %804 = "memref.load"(%85, %803) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %805 = "arith.addi"(%81, %15) : (index, index) -> index | |
| %806 = "memref.load"(%85, %805) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %807 = "arith.addi"(%81, %14) : (index, index) -> index | |
| %808 = "memref.load"(%85, %807) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %809 = "arith.addi"(%81, %13) : (index, index) -> index | |
| %810 = "memref.load"(%85, %809) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %811 = "arith.addi"(%81, %12) : (index, index) -> index | |
| %812 = "memref.load"(%85, %811) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %813 = "arith.addi"(%81, %11) : (index, index) -> index | |
| %814 = "memref.load"(%85, %813) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %815 = "arith.addi"(%81, %10) : (index, index) -> index | |
| %816 = "memref.load"(%85, %815) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %817 = "arith.addi"(%81, %9) : (index, index) -> index | |
| %818 = "memref.load"(%85, %817) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %819 = "arith.addi"(%81, %8) : (index, index) -> index | |
| %820 = "memref.load"(%85, %819) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %821 = "arith.addi"(%81, %7) : (index, index) -> index | |
| %822 = "memref.load"(%85, %821) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %823 = "arith.addi"(%81, %6) : (index, index) -> index | |
| %824 = "memref.load"(%85, %823) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %825 = "arith.addi"(%81, %5) : (index, index) -> index | |
| %826 = "memref.load"(%85, %825) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %827 = "arith.addi"(%81, %4) : (index, index) -> index | |
| %828 = "memref.load"(%85, %827) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %829 = "arith.addi"(%81, %3) : (index, index) -> index | |
| %830 = "memref.load"(%85, %829) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %831 = "arith.addi"(%81, %2) : (index, index) -> index | |
| %832 = "memref.load"(%85, %831) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %833 = "arith.addi"(%81, %1) : (index, index) -> index | |
| %834 = "memref.load"(%85, %833) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %835 = "vector.extract"(%709) {position = [0]} : (vector<4xf32>) -> f32 | |
| %836 = "vector.splat"(%835) : (f32) -> vector<4xf32> | |
| %837 = "vector.fma"(%836, %772, %arg1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %838 = "vector.extract"(%709) {position = [1]} : (vector<4xf32>) -> f32 | |
| %839 = "vector.splat"(%838) : (f32) -> vector<4xf32> | |
| %840 = "vector.fma"(%839, %774, %837) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %841 = "vector.extract"(%709) {position = [2]} : (vector<4xf32>) -> f32 | |
| %842 = "vector.splat"(%841) : (f32) -> vector<4xf32> | |
| %843 = "vector.fma"(%842, %776, %840) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %844 = "vector.extract"(%709) {position = [3]} : (vector<4xf32>) -> f32 | |
| %845 = "vector.splat"(%844) : (f32) -> vector<4xf32> | |
| %846 = "vector.fma"(%845, %778, %843) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %847 = "vector.extract"(%711) {position = [0]} : (vector<4xf32>) -> f32 | |
| %848 = "vector.splat"(%847) : (f32) -> vector<4xf32> | |
| %849 = "vector.fma"(%848, %780, %846) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %850 = "vector.extract"(%711) {position = [1]} : (vector<4xf32>) -> f32 | |
| %851 = "vector.splat"(%850) : (f32) -> vector<4xf32> | |
| %852 = "vector.fma"(%851, %782, %849) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %853 = "vector.extract"(%711) {position = [2]} : (vector<4xf32>) -> f32 | |
| %854 = "vector.splat"(%853) : (f32) -> vector<4xf32> | |
| %855 = "vector.fma"(%854, %784, %852) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %856 = "vector.extract"(%711) {position = [3]} : (vector<4xf32>) -> f32 | |
| %857 = "vector.splat"(%856) : (f32) -> vector<4xf32> | |
| %858 = "vector.fma"(%857, %786, %855) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %859 = "vector.extract"(%713) {position = [0]} : (vector<4xf32>) -> f32 | |
| %860 = "vector.splat"(%859) : (f32) -> vector<4xf32> | |
| %861 = "vector.fma"(%860, %788, %858) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %862 = "vector.extract"(%713) {position = [1]} : (vector<4xf32>) -> f32 | |
| %863 = "vector.splat"(%862) : (f32) -> vector<4xf32> | |
| %864 = "vector.fma"(%863, %790, %861) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %865 = "vector.extract"(%713) {position = [2]} : (vector<4xf32>) -> f32 | |
| %866 = "vector.splat"(%865) : (f32) -> vector<4xf32> | |
| %867 = "vector.fma"(%866, %792, %864) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %868 = "vector.extract"(%713) {position = [3]} : (vector<4xf32>) -> f32 | |
| %869 = "vector.splat"(%868) : (f32) -> vector<4xf32> | |
| %870 = "vector.fma"(%869, %794, %867) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %871 = "vector.extract"(%715) {position = [0]} : (vector<4xf32>) -> f32 | |
| %872 = "vector.splat"(%871) : (f32) -> vector<4xf32> | |
| %873 = "vector.fma"(%872, %796, %870) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %874 = "vector.extract"(%715) {position = [1]} : (vector<4xf32>) -> f32 | |
| %875 = "vector.splat"(%874) : (f32) -> vector<4xf32> | |
| %876 = "vector.fma"(%875, %798, %873) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %877 = "vector.extract"(%715) {position = [2]} : (vector<4xf32>) -> f32 | |
| %878 = "vector.splat"(%877) : (f32) -> vector<4xf32> | |
| %879 = "vector.fma"(%878, %800, %876) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %880 = "vector.extract"(%715) {position = [3]} : (vector<4xf32>) -> f32 | |
| %881 = "vector.splat"(%880) : (f32) -> vector<4xf32> | |
| %882 = "vector.fma"(%881, %802, %879) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %883 = "vector.extract"(%717) {position = [0]} : (vector<4xf32>) -> f32 | |
| %884 = "vector.splat"(%883) : (f32) -> vector<4xf32> | |
| %885 = "vector.fma"(%884, %804, %882) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %886 = "vector.extract"(%717) {position = [1]} : (vector<4xf32>) -> f32 | |
| %887 = "vector.splat"(%886) : (f32) -> vector<4xf32> | |
| %888 = "vector.fma"(%887, %806, %885) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %889 = "vector.extract"(%717) {position = [2]} : (vector<4xf32>) -> f32 | |
| %890 = "vector.splat"(%889) : (f32) -> vector<4xf32> | |
| %891 = "vector.fma"(%890, %808, %888) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %892 = "vector.extract"(%717) {position = [3]} : (vector<4xf32>) -> f32 | |
| %893 = "vector.splat"(%892) : (f32) -> vector<4xf32> | |
| %894 = "vector.fma"(%893, %810, %891) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %895 = "vector.extract"(%719) {position = [0]} : (vector<4xf32>) -> f32 | |
| %896 = "vector.splat"(%895) : (f32) -> vector<4xf32> | |
| %897 = "vector.fma"(%896, %812, %894) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %898 = "vector.extract"(%719) {position = [1]} : (vector<4xf32>) -> f32 | |
| %899 = "vector.splat"(%898) : (f32) -> vector<4xf32> | |
| %900 = "vector.fma"(%899, %814, %897) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %901 = "vector.extract"(%719) {position = [2]} : (vector<4xf32>) -> f32 | |
| %902 = "vector.splat"(%901) : (f32) -> vector<4xf32> | |
| %903 = "vector.fma"(%902, %816, %900) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %904 = "vector.extract"(%719) {position = [3]} : (vector<4xf32>) -> f32 | |
| %905 = "vector.splat"(%904) : (f32) -> vector<4xf32> | |
| %906 = "vector.fma"(%905, %818, %903) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %907 = "vector.extract"(%721) {position = [0]} : (vector<4xf32>) -> f32 | |
| %908 = "vector.splat"(%907) : (f32) -> vector<4xf32> | |
| %909 = "vector.fma"(%908, %820, %906) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %910 = "vector.extract"(%721) {position = [1]} : (vector<4xf32>) -> f32 | |
| %911 = "vector.splat"(%910) : (f32) -> vector<4xf32> | |
| %912 = "vector.fma"(%911, %822, %909) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %913 = "vector.extract"(%721) {position = [2]} : (vector<4xf32>) -> f32 | |
| %914 = "vector.splat"(%913) : (f32) -> vector<4xf32> | |
| %915 = "vector.fma"(%914, %824, %912) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %916 = "vector.extract"(%721) {position = [3]} : (vector<4xf32>) -> f32 | |
| %917 = "vector.splat"(%916) : (f32) -> vector<4xf32> | |
| %918 = "vector.fma"(%917, %826, %915) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %919 = "vector.extract"(%723) {position = [0]} : (vector<4xf32>) -> f32 | |
| %920 = "vector.splat"(%919) : (f32) -> vector<4xf32> | |
| %921 = "vector.fma"(%920, %828, %918) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %922 = "vector.extract"(%723) {position = [1]} : (vector<4xf32>) -> f32 | |
| %923 = "vector.splat"(%922) : (f32) -> vector<4xf32> | |
| %924 = "vector.fma"(%923, %830, %921) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %925 = "vector.extract"(%723) {position = [2]} : (vector<4xf32>) -> f32 | |
| %926 = "vector.splat"(%925) : (f32) -> vector<4xf32> | |
| %927 = "vector.fma"(%926, %832, %924) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %928 = "vector.extract"(%723) {position = [3]} : (vector<4xf32>) -> f32 | |
| %929 = "vector.splat"(%928) : (f32) -> vector<4xf32> | |
| %930 = "vector.fma"(%929, %834, %927) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %931 = "vector.extract"(%725) {position = [0]} : (vector<4xf32>) -> f32 | |
| %932 = "vector.splat"(%931) : (f32) -> vector<4xf32> | |
| %933 = "vector.fma"(%932, %772, %arg2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %934 = "vector.extract"(%725) {position = [1]} : (vector<4xf32>) -> f32 | |
| %935 = "vector.splat"(%934) : (f32) -> vector<4xf32> | |
| %936 = "vector.fma"(%935, %774, %933) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %937 = "vector.extract"(%725) {position = [2]} : (vector<4xf32>) -> f32 | |
| %938 = "vector.splat"(%937) : (f32) -> vector<4xf32> | |
| %939 = "vector.fma"(%938, %776, %936) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %940 = "vector.extract"(%725) {position = [3]} : (vector<4xf32>) -> f32 | |
| %941 = "vector.splat"(%940) : (f32) -> vector<4xf32> | |
| %942 = "vector.fma"(%941, %778, %939) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %943 = "vector.extract"(%727) {position = [0]} : (vector<4xf32>) -> f32 | |
| %944 = "vector.splat"(%943) : (f32) -> vector<4xf32> | |
| %945 = "vector.fma"(%944, %780, %942) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %946 = "vector.extract"(%727) {position = [1]} : (vector<4xf32>) -> f32 | |
| %947 = "vector.splat"(%946) : (f32) -> vector<4xf32> | |
| %948 = "vector.fma"(%947, %782, %945) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %949 = "vector.extract"(%727) {position = [2]} : (vector<4xf32>) -> f32 | |
| %950 = "vector.splat"(%949) : (f32) -> vector<4xf32> | |
| %951 = "vector.fma"(%950, %784, %948) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %952 = "vector.extract"(%727) {position = [3]} : (vector<4xf32>) -> f32 | |
| %953 = "vector.splat"(%952) : (f32) -> vector<4xf32> | |
| %954 = "vector.fma"(%953, %786, %951) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %955 = "vector.extract"(%729) {position = [0]} : (vector<4xf32>) -> f32 | |
| %956 = "vector.splat"(%955) : (f32) -> vector<4xf32> | |
| %957 = "vector.fma"(%956, %788, %954) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %958 = "vector.extract"(%729) {position = [1]} : (vector<4xf32>) -> f32 | |
| %959 = "vector.splat"(%958) : (f32) -> vector<4xf32> | |
| %960 = "vector.fma"(%959, %790, %957) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %961 = "vector.extract"(%729) {position = [2]} : (vector<4xf32>) -> f32 | |
| %962 = "vector.splat"(%961) : (f32) -> vector<4xf32> | |
| %963 = "vector.fma"(%962, %792, %960) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %964 = "vector.extract"(%729) {position = [3]} : (vector<4xf32>) -> f32 | |
| %965 = "vector.splat"(%964) : (f32) -> vector<4xf32> | |
| %966 = "vector.fma"(%965, %794, %963) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %967 = "vector.extract"(%731) {position = [0]} : (vector<4xf32>) -> f32 | |
| %968 = "vector.splat"(%967) : (f32) -> vector<4xf32> | |
| %969 = "vector.fma"(%968, %796, %966) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %970 = "vector.extract"(%731) {position = [1]} : (vector<4xf32>) -> f32 | |
| %971 = "vector.splat"(%970) : (f32) -> vector<4xf32> | |
| %972 = "vector.fma"(%971, %798, %969) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %973 = "vector.extract"(%731) {position = [2]} : (vector<4xf32>) -> f32 | |
| %974 = "vector.splat"(%973) : (f32) -> vector<4xf32> | |
| %975 = "vector.fma"(%974, %800, %972) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %976 = "vector.extract"(%731) {position = [3]} : (vector<4xf32>) -> f32 | |
| %977 = "vector.splat"(%976) : (f32) -> vector<4xf32> | |
| %978 = "vector.fma"(%977, %802, %975) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %979 = "vector.extract"(%733) {position = [0]} : (vector<4xf32>) -> f32 | |
| %980 = "vector.splat"(%979) : (f32) -> vector<4xf32> | |
| %981 = "vector.fma"(%980, %804, %978) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %982 = "vector.extract"(%733) {position = [1]} : (vector<4xf32>) -> f32 | |
| %983 = "vector.splat"(%982) : (f32) -> vector<4xf32> | |
| %984 = "vector.fma"(%983, %806, %981) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %985 = "vector.extract"(%733) {position = [2]} : (vector<4xf32>) -> f32 | |
| %986 = "vector.splat"(%985) : (f32) -> vector<4xf32> | |
| %987 = "vector.fma"(%986, %808, %984) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %988 = "vector.extract"(%733) {position = [3]} : (vector<4xf32>) -> f32 | |
| %989 = "vector.splat"(%988) : (f32) -> vector<4xf32> | |
| %990 = "vector.fma"(%989, %810, %987) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %991 = "vector.extract"(%735) {position = [0]} : (vector<4xf32>) -> f32 | |
| %992 = "vector.splat"(%991) : (f32) -> vector<4xf32> | |
| %993 = "vector.fma"(%992, %812, %990) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %994 = "vector.extract"(%735) {position = [1]} : (vector<4xf32>) -> f32 | |
| %995 = "vector.splat"(%994) : (f32) -> vector<4xf32> | |
| %996 = "vector.fma"(%995, %814, %993) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %997 = "vector.extract"(%735) {position = [2]} : (vector<4xf32>) -> f32 | |
| %998 = "vector.splat"(%997) : (f32) -> vector<4xf32> | |
| %999 = "vector.fma"(%998, %816, %996) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1000 = "vector.extract"(%735) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1001 = "vector.splat"(%1000) : (f32) -> vector<4xf32> | |
| %1002 = "vector.fma"(%1001, %818, %999) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1003 = "vector.extract"(%737) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1004 = "vector.splat"(%1003) : (f32) -> vector<4xf32> | |
| %1005 = "vector.fma"(%1004, %820, %1002) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1006 = "vector.extract"(%737) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1007 = "vector.splat"(%1006) : (f32) -> vector<4xf32> | |
| %1008 = "vector.fma"(%1007, %822, %1005) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1009 = "vector.extract"(%737) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1010 = "vector.splat"(%1009) : (f32) -> vector<4xf32> | |
| %1011 = "vector.fma"(%1010, %824, %1008) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1012 = "vector.extract"(%737) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1013 = "vector.splat"(%1012) : (f32) -> vector<4xf32> | |
| %1014 = "vector.fma"(%1013, %826, %1011) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1015 = "vector.extract"(%739) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1016 = "vector.splat"(%1015) : (f32) -> vector<4xf32> | |
| %1017 = "vector.fma"(%1016, %828, %1014) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1018 = "vector.extract"(%739) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1019 = "vector.splat"(%1018) : (f32) -> vector<4xf32> | |
| %1020 = "vector.fma"(%1019, %830, %1017) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1021 = "vector.extract"(%739) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1022 = "vector.splat"(%1021) : (f32) -> vector<4xf32> | |
| %1023 = "vector.fma"(%1022, %832, %1020) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1024 = "vector.extract"(%739) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1025 = "vector.splat"(%1024) : (f32) -> vector<4xf32> | |
| %1026 = "vector.fma"(%1025, %834, %1023) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1027 = "vector.extract"(%741) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1028 = "vector.splat"(%1027) : (f32) -> vector<4xf32> | |
| %1029 = "vector.fma"(%1028, %772, %arg3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1030 = "vector.extract"(%741) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1031 = "vector.splat"(%1030) : (f32) -> vector<4xf32> | |
| %1032 = "vector.fma"(%1031, %774, %1029) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1033 = "vector.extract"(%741) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1034 = "vector.splat"(%1033) : (f32) -> vector<4xf32> | |
| %1035 = "vector.fma"(%1034, %776, %1032) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1036 = "vector.extract"(%741) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1037 = "vector.splat"(%1036) : (f32) -> vector<4xf32> | |
| %1038 = "vector.fma"(%1037, %778, %1035) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1039 = "vector.extract"(%743) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1040 = "vector.splat"(%1039) : (f32) -> vector<4xf32> | |
| %1041 = "vector.fma"(%1040, %780, %1038) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1042 = "vector.extract"(%743) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1043 = "vector.splat"(%1042) : (f32) -> vector<4xf32> | |
| %1044 = "vector.fma"(%1043, %782, %1041) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1045 = "vector.extract"(%743) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1046 = "vector.splat"(%1045) : (f32) -> vector<4xf32> | |
| %1047 = "vector.fma"(%1046, %784, %1044) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1048 = "vector.extract"(%743) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1049 = "vector.splat"(%1048) : (f32) -> vector<4xf32> | |
| %1050 = "vector.fma"(%1049, %786, %1047) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1051 = "vector.extract"(%745) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1052 = "vector.splat"(%1051) : (f32) -> vector<4xf32> | |
| %1053 = "vector.fma"(%1052, %788, %1050) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1054 = "vector.extract"(%745) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1055 = "vector.splat"(%1054) : (f32) -> vector<4xf32> | |
| %1056 = "vector.fma"(%1055, %790, %1053) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1057 = "vector.extract"(%745) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1058 = "vector.splat"(%1057) : (f32) -> vector<4xf32> | |
| %1059 = "vector.fma"(%1058, %792, %1056) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1060 = "vector.extract"(%745) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1061 = "vector.splat"(%1060) : (f32) -> vector<4xf32> | |
| %1062 = "vector.fma"(%1061, %794, %1059) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1063 = "vector.extract"(%747) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1064 = "vector.splat"(%1063) : (f32) -> vector<4xf32> | |
| %1065 = "vector.fma"(%1064, %796, %1062) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1066 = "vector.extract"(%747) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1067 = "vector.splat"(%1066) : (f32) -> vector<4xf32> | |
| %1068 = "vector.fma"(%1067, %798, %1065) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1069 = "vector.extract"(%747) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1070 = "vector.splat"(%1069) : (f32) -> vector<4xf32> | |
| %1071 = "vector.fma"(%1070, %800, %1068) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1072 = "vector.extract"(%747) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1073 = "vector.splat"(%1072) : (f32) -> vector<4xf32> | |
| %1074 = "vector.fma"(%1073, %802, %1071) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1075 = "vector.extract"(%749) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1076 = "vector.splat"(%1075) : (f32) -> vector<4xf32> | |
| %1077 = "vector.fma"(%1076, %804, %1074) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1078 = "vector.extract"(%749) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1079 = "vector.splat"(%1078) : (f32) -> vector<4xf32> | |
| %1080 = "vector.fma"(%1079, %806, %1077) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1081 = "vector.extract"(%749) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1082 = "vector.splat"(%1081) : (f32) -> vector<4xf32> | |
| %1083 = "vector.fma"(%1082, %808, %1080) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1084 = "vector.extract"(%749) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1085 = "vector.splat"(%1084) : (f32) -> vector<4xf32> | |
| %1086 = "vector.fma"(%1085, %810, %1083) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1087 = "vector.extract"(%751) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1088 = "vector.splat"(%1087) : (f32) -> vector<4xf32> | |
| %1089 = "vector.fma"(%1088, %812, %1086) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1090 = "vector.extract"(%751) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1091 = "vector.splat"(%1090) : (f32) -> vector<4xf32> | |
| %1092 = "vector.fma"(%1091, %814, %1089) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1093 = "vector.extract"(%751) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1094 = "vector.splat"(%1093) : (f32) -> vector<4xf32> | |
| %1095 = "vector.fma"(%1094, %816, %1092) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1096 = "vector.extract"(%751) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1097 = "vector.splat"(%1096) : (f32) -> vector<4xf32> | |
| %1098 = "vector.fma"(%1097, %818, %1095) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1099 = "vector.extract"(%753) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1100 = "vector.splat"(%1099) : (f32) -> vector<4xf32> | |
| %1101 = "vector.fma"(%1100, %820, %1098) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1102 = "vector.extract"(%753) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1103 = "vector.splat"(%1102) : (f32) -> vector<4xf32> | |
| %1104 = "vector.fma"(%1103, %822, %1101) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1105 = "vector.extract"(%753) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1106 = "vector.splat"(%1105) : (f32) -> vector<4xf32> | |
| %1107 = "vector.fma"(%1106, %824, %1104) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1108 = "vector.extract"(%753) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1109 = "vector.splat"(%1108) : (f32) -> vector<4xf32> | |
| %1110 = "vector.fma"(%1109, %826, %1107) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1111 = "vector.extract"(%755) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1112 = "vector.splat"(%1111) : (f32) -> vector<4xf32> | |
| %1113 = "vector.fma"(%1112, %828, %1110) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1114 = "vector.extract"(%755) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1115 = "vector.splat"(%1114) : (f32) -> vector<4xf32> | |
| %1116 = "vector.fma"(%1115, %830, %1113) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1117 = "vector.extract"(%755) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1118 = "vector.splat"(%1117) : (f32) -> vector<4xf32> | |
| %1119 = "vector.fma"(%1118, %832, %1116) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1120 = "vector.extract"(%755) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1121 = "vector.splat"(%1120) : (f32) -> vector<4xf32> | |
| %1122 = "vector.fma"(%1121, %834, %1119) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1123 = "vector.extract"(%757) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1124 = "vector.splat"(%1123) : (f32) -> vector<4xf32> | |
| %1125 = "vector.fma"(%1124, %772, %arg4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1126 = "vector.extract"(%757) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1127 = "vector.splat"(%1126) : (f32) -> vector<4xf32> | |
| %1128 = "vector.fma"(%1127, %774, %1125) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1129 = "vector.extract"(%757) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1130 = "vector.splat"(%1129) : (f32) -> vector<4xf32> | |
| %1131 = "vector.fma"(%1130, %776, %1128) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1132 = "vector.extract"(%757) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1133 = "vector.splat"(%1132) : (f32) -> vector<4xf32> | |
| %1134 = "vector.fma"(%1133, %778, %1131) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1135 = "vector.extract"(%759) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1136 = "vector.splat"(%1135) : (f32) -> vector<4xf32> | |
| %1137 = "vector.fma"(%1136, %780, %1134) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1138 = "vector.extract"(%759) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1139 = "vector.splat"(%1138) : (f32) -> vector<4xf32> | |
| %1140 = "vector.fma"(%1139, %782, %1137) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1141 = "vector.extract"(%759) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1142 = "vector.splat"(%1141) : (f32) -> vector<4xf32> | |
| %1143 = "vector.fma"(%1142, %784, %1140) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1144 = "vector.extract"(%759) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1145 = "vector.splat"(%1144) : (f32) -> vector<4xf32> | |
| %1146 = "vector.fma"(%1145, %786, %1143) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1147 = "vector.extract"(%761) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1148 = "vector.splat"(%1147) : (f32) -> vector<4xf32> | |
| %1149 = "vector.fma"(%1148, %788, %1146) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1150 = "vector.extract"(%761) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1151 = "vector.splat"(%1150) : (f32) -> vector<4xf32> | |
| %1152 = "vector.fma"(%1151, %790, %1149) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1153 = "vector.extract"(%761) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1154 = "vector.splat"(%1153) : (f32) -> vector<4xf32> | |
| %1155 = "vector.fma"(%1154, %792, %1152) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1156 = "vector.extract"(%761) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1157 = "vector.splat"(%1156) : (f32) -> vector<4xf32> | |
| %1158 = "vector.fma"(%1157, %794, %1155) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1159 = "vector.extract"(%763) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1160 = "vector.splat"(%1159) : (f32) -> vector<4xf32> | |
| %1161 = "vector.fma"(%1160, %796, %1158) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1162 = "vector.extract"(%763) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1163 = "vector.splat"(%1162) : (f32) -> vector<4xf32> | |
| %1164 = "vector.fma"(%1163, %798, %1161) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1165 = "vector.extract"(%763) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1166 = "vector.splat"(%1165) : (f32) -> vector<4xf32> | |
| %1167 = "vector.fma"(%1166, %800, %1164) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1168 = "vector.extract"(%763) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1169 = "vector.splat"(%1168) : (f32) -> vector<4xf32> | |
| %1170 = "vector.fma"(%1169, %802, %1167) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1171 = "vector.extract"(%765) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1172 = "vector.splat"(%1171) : (f32) -> vector<4xf32> | |
| %1173 = "vector.fma"(%1172, %804, %1170) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1174 = "vector.extract"(%765) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1175 = "vector.splat"(%1174) : (f32) -> vector<4xf32> | |
| %1176 = "vector.fma"(%1175, %806, %1173) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1177 = "vector.extract"(%765) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1178 = "vector.splat"(%1177) : (f32) -> vector<4xf32> | |
| %1179 = "vector.fma"(%1178, %808, %1176) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1180 = "vector.extract"(%765) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1181 = "vector.splat"(%1180) : (f32) -> vector<4xf32> | |
| %1182 = "vector.fma"(%1181, %810, %1179) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1183 = "vector.extract"(%767) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1184 = "vector.splat"(%1183) : (f32) -> vector<4xf32> | |
| %1185 = "vector.fma"(%1184, %812, %1182) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1186 = "vector.extract"(%767) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1187 = "vector.splat"(%1186) : (f32) -> vector<4xf32> | |
| %1188 = "vector.fma"(%1187, %814, %1185) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1189 = "vector.extract"(%767) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1190 = "vector.splat"(%1189) : (f32) -> vector<4xf32> | |
| %1191 = "vector.fma"(%1190, %816, %1188) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1192 = "vector.extract"(%767) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1193 = "vector.splat"(%1192) : (f32) -> vector<4xf32> | |
| %1194 = "vector.fma"(%1193, %818, %1191) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1195 = "vector.extract"(%769) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1196 = "vector.splat"(%1195) : (f32) -> vector<4xf32> | |
| %1197 = "vector.fma"(%1196, %820, %1194) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1198 = "vector.extract"(%769) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1199 = "vector.splat"(%1198) : (f32) -> vector<4xf32> | |
| %1200 = "vector.fma"(%1199, %822, %1197) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1201 = "vector.extract"(%769) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1202 = "vector.splat"(%1201) : (f32) -> vector<4xf32> | |
| %1203 = "vector.fma"(%1202, %824, %1200) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1204 = "vector.extract"(%769) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1205 = "vector.splat"(%1204) : (f32) -> vector<4xf32> | |
| %1206 = "vector.fma"(%1205, %826, %1203) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1207 = "vector.extract"(%771) {position = [0]} : (vector<4xf32>) -> f32 | |
| %1208 = "vector.splat"(%1207) : (f32) -> vector<4xf32> | |
| %1209 = "vector.fma"(%1208, %828, %1206) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1210 = "vector.extract"(%771) {position = [1]} : (vector<4xf32>) -> f32 | |
| %1211 = "vector.splat"(%1210) : (f32) -> vector<4xf32> | |
| %1212 = "vector.fma"(%1211, %830, %1209) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1213 = "vector.extract"(%771) {position = [2]} : (vector<4xf32>) -> f32 | |
| %1214 = "vector.splat"(%1213) : (f32) -> vector<4xf32> | |
| %1215 = "vector.fma"(%1214, %832, %1212) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1216 = "vector.extract"(%771) {position = [3]} : (vector<4xf32>) -> f32 | |
| %1217 = "vector.splat"(%1216) : (f32) -> vector<4xf32> | |
| %1218 = "vector.fma"(%1217, %834, %1215) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %1219 = "arith.addi"(%arg0, %80) : (index, index) -> index | |
| %1220 = "arith.cmpi"(%1219, %74) {predicate = 2 : i64} : (index, index) -> i1 | |
| %1221 = "arith.subi"(%0, %arg0) : (index, index) -> index | |
| %1222 = "arith.select"(%1220, %1221, %1219) : (i1, index, index) -> index | |
| %1223 = "arith.divsi"(%1222, %53) : (index, index) -> index | |
| %1224 = "arith.subi"(%70, %1223) : (index, index) -> index | |
| %1225 = "arith.select"(%1220, %1224, %1223) : (i1, index, index) -> index | |
| %1226 = "arith.addi"(%139, %1225) : (index, index) -> index | |
| %1227 = "arith.addi"(%1226, %146) : (index, index) -> index | |
| %1228 = "arith.addi"(%1227, %75) : (index, index) -> index | |
| %1229 = "memref.load"(%93, %1228) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1230 = "arith.addi"(%1227, %63) : (index, index) -> index | |
| %1231 = "memref.load"(%94, %1230) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1232 = "arith.muli"(%1219, %69) : (index, index) -> index | |
| %1233 = "arith.addi"(%1232, %81) : (index, index) -> index | |
| %1234 = "arith.addi"(%1233, %152) : (index, index) -> index | |
| %1235 = "arith.addi"(%1234, %154) : (index, index) -> index | |
| %1236 = "arith.addi"(%1235, %118) : (index, index) -> index | |
| %1237 = "arith.addi"(%1236, %160) : (index, index) -> index | |
| %1238 = "arith.addi"(%1237, %167) : (index, index) -> index | |
| %1239 = "memref.load"(%98, %1238) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| %1240 = "arith.addi"(%1238, %62) : (index, index) -> index | |
| %1241 = "memref.load"(%99, %1240) : (memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> vector<4xf32> | |
| "scf.yield"(%930, %1026, %1122, %1218, %1229, %1231, %1239, %1241) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> () | |
| }) : (index, index, index, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) | |
| "gpu.barrier"() : () -> () | |
| %173 = "arith.muli"(%82, %60) : (index, index) -> index | |
| %174 = "arith.addi"(%81, %173) : (index, index) -> index | |
| %175 = "arith.muli"(%83, %77) : (index, index) -> index | |
| %176 = "arith.addi"(%174, %175) : (index, index) -> index | |
| %177 = "arith.addi"(%176, %145) : (index, index) -> index | |
| "memref.store"(%172#4, %84, %177) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %178 = "arith.addi"(%177, %77) : (index, index) -> index | |
| "memref.store"(%172#5, %84, %178) {nontemporal = false} : (vector<4xf32>, memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %179 = "arith.muli"(%82, %59) : (index, index) -> index | |
| %180 = "arith.addi"(%81, %179) : (index, index) -> index | |
| %181 = "arith.muli"(%83, %58) : (index, index) -> index | |
| %182 = "arith.addi"(%180, %181) : (index, index) -> index | |
| %183 = "arith.addi"(%182, %159) : (index, index) -> index | |
| "memref.store"(%172#6, %85, %183) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| %184 = "arith.addi"(%183, %58) : (index, index) -> index | |
| "memref.store"(%172#7, %85, %184) {nontemporal = false} : (vector<4xf32>, memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> () | |
| "gpu.barrier"() : () -> () | |
| %185 = "arith.muli"(%82, %57) : (index, index) -> index | |
| %186 = "memref.load"(%84, %185) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %187 = "arith.addi"(%185, %56) : (index, index) -> index | |
| %188 = "memref.load"(%84, %187) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %189 = "arith.addi"(%185, %55) : (index, index) -> index | |
| %190 = "memref.load"(%84, %189) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %191 = "arith.addi"(%185, %54) : (index, index) -> index | |
| %192 = "memref.load"(%84, %191) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %193 = "arith.addi"(%185, %53) : (index, index) -> index | |
| %194 = "memref.load"(%84, %193) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %195 = "arith.addi"(%185, %52) : (index, index) -> index | |
| %196 = "memref.load"(%84, %195) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %197 = "arith.addi"(%185, %51) : (index, index) -> index | |
| %198 = "memref.load"(%84, %197) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %199 = "arith.addi"(%185, %50) : (index, index) -> index | |
| %200 = "memref.load"(%84, %199) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %201 = "arith.addi"(%185, %49) : (index, index) -> index | |
| %202 = "memref.load"(%84, %201) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %203 = "arith.addi"(%185, %48) : (index, index) -> index | |
| %204 = "memref.load"(%84, %203) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %205 = "arith.addi"(%185, %47) : (index, index) -> index | |
| %206 = "memref.load"(%84, %205) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %207 = "arith.addi"(%185, %46) : (index, index) -> index | |
| %208 = "memref.load"(%84, %207) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %209 = "arith.addi"(%185, %45) : (index, index) -> index | |
| %210 = "memref.load"(%84, %209) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %211 = "arith.addi"(%185, %44) : (index, index) -> index | |
| %212 = "memref.load"(%84, %211) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %213 = "arith.addi"(%185, %43) : (index, index) -> index | |
| %214 = "memref.load"(%84, %213) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %215 = "arith.addi"(%185, %71) : (index, index) -> index | |
| %216 = "memref.load"(%84, %215) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %217 = "arith.addi"(%185, %60) : (index, index) -> index | |
| %218 = "memref.load"(%84, %217) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %219 = "arith.addi"(%185, %42) : (index, index) -> index | |
| %220 = "memref.load"(%84, %219) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %221 = "arith.addi"(%185, %41) : (index, index) -> index | |
| %222 = "memref.load"(%84, %221) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %223 = "arith.addi"(%185, %40) : (index, index) -> index | |
| %224 = "memref.load"(%84, %223) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %225 = "arith.addi"(%185, %39) : (index, index) -> index | |
| %226 = "memref.load"(%84, %225) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %227 = "arith.addi"(%185, %38) : (index, index) -> index | |
| %228 = "memref.load"(%84, %227) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %229 = "arith.addi"(%185, %37) : (index, index) -> index | |
| %230 = "memref.load"(%84, %229) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %231 = "arith.addi"(%185, %36) : (index, index) -> index | |
| %232 = "memref.load"(%84, %231) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %233 = "arith.addi"(%185, %35) : (index, index) -> index | |
| %234 = "memref.load"(%84, %233) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %235 = "arith.addi"(%185, %34) : (index, index) -> index | |
| %236 = "memref.load"(%84, %235) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %237 = "arith.addi"(%185, %33) : (index, index) -> index | |
| %238 = "memref.load"(%84, %237) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %239 = "arith.addi"(%185, %32) : (index, index) -> index | |
| %240 = "memref.load"(%84, %239) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %241 = "arith.addi"(%185, %31) : (index, index) -> index | |
| %242 = "memref.load"(%84, %241) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %243 = "arith.addi"(%185, %80) : (index, index) -> index | |
| %244 = "memref.load"(%84, %243) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %245 = "arith.addi"(%185, %30) : (index, index) -> index | |
| %246 = "memref.load"(%84, %245) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %247 = "arith.addi"(%185, %29) : (index, index) -> index | |
| %248 = "memref.load"(%84, %247) {nontemporal = false} : (memref<576xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %249 = "memref.load"(%85, %81) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %250 = "arith.addi"(%81, %59) : (index, index) -> index | |
| %251 = "memref.load"(%85, %250) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %252 = "arith.addi"(%81, %29) : (index, index) -> index | |
| %253 = "memref.load"(%85, %252) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %254 = "arith.addi"(%81, %28) : (index, index) -> index | |
| %255 = "memref.load"(%85, %254) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %256 = "arith.addi"(%81, %27) : (index, index) -> index | |
| %257 = "memref.load"(%85, %256) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %258 = "arith.addi"(%81, %26) : (index, index) -> index | |
| %259 = "memref.load"(%85, %258) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %260 = "arith.addi"(%81, %25) : (index, index) -> index | |
| %261 = "memref.load"(%85, %260) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %262 = "arith.addi"(%81, %24) : (index, index) -> index | |
| %263 = "memref.load"(%85, %262) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %264 = "arith.addi"(%81, %23) : (index, index) -> index | |
| %265 = "memref.load"(%85, %264) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %266 = "arith.addi"(%81, %22) : (index, index) -> index | |
| %267 = "memref.load"(%85, %266) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %268 = "arith.addi"(%81, %21) : (index, index) -> index | |
| %269 = "memref.load"(%85, %268) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %270 = "arith.addi"(%81, %20) : (index, index) -> index | |
| %271 = "memref.load"(%85, %270) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %272 = "arith.addi"(%81, %19) : (index, index) -> index | |
| %273 = "memref.load"(%85, %272) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %274 = "arith.addi"(%81, %18) : (index, index) -> index | |
| %275 = "memref.load"(%85, %274) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %276 = "arith.addi"(%81, %17) : (index, index) -> index | |
| %277 = "memref.load"(%85, %276) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %278 = "arith.addi"(%81, %16) : (index, index) -> index | |
| %279 = "memref.load"(%85, %278) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %280 = "arith.addi"(%81, %58) : (index, index) -> index | |
| %281 = "memref.load"(%85, %280) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %282 = "arith.addi"(%81, %15) : (index, index) -> index | |
| %283 = "memref.load"(%85, %282) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %284 = "arith.addi"(%81, %14) : (index, index) -> index | |
| %285 = "memref.load"(%85, %284) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %286 = "arith.addi"(%81, %13) : (index, index) -> index | |
| %287 = "memref.load"(%85, %286) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %288 = "arith.addi"(%81, %12) : (index, index) -> index | |
| %289 = "memref.load"(%85, %288) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %290 = "arith.addi"(%81, %11) : (index, index) -> index | |
| %291 = "memref.load"(%85, %290) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %292 = "arith.addi"(%81, %10) : (index, index) -> index | |
| %293 = "memref.load"(%85, %292) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %294 = "arith.addi"(%81, %9) : (index, index) -> index | |
| %295 = "memref.load"(%85, %294) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %296 = "arith.addi"(%81, %8) : (index, index) -> index | |
| %297 = "memref.load"(%85, %296) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %298 = "arith.addi"(%81, %7) : (index, index) -> index | |
| %299 = "memref.load"(%85, %298) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %300 = "arith.addi"(%81, %6) : (index, index) -> index | |
| %301 = "memref.load"(%85, %300) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %302 = "arith.addi"(%81, %5) : (index, index) -> index | |
| %303 = "memref.load"(%85, %302) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %304 = "arith.addi"(%81, %4) : (index, index) -> index | |
| %305 = "memref.load"(%85, %304) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %306 = "arith.addi"(%81, %3) : (index, index) -> index | |
| %307 = "memref.load"(%85, %306) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %308 = "arith.addi"(%81, %2) : (index, index) -> index | |
| %309 = "memref.load"(%85, %308) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %310 = "arith.addi"(%81, %1) : (index, index) -> index | |
| %311 = "memref.load"(%85, %310) {nontemporal = false} : (memref<544xvector<4xf32>, #spirv.storage_class<Workgroup>>, index) -> vector<4xf32> | |
| %312 = "vector.extract"(%186) {position = [0]} : (vector<4xf32>) -> f32 | |
| %313 = "vector.splat"(%312) : (f32) -> vector<4xf32> | |
| %314 = "vector.fma"(%313, %249, %172#0) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %315 = "vector.extract"(%186) {position = [1]} : (vector<4xf32>) -> f32 | |
| %316 = "vector.splat"(%315) : (f32) -> vector<4xf32> | |
| %317 = "vector.fma"(%316, %251, %314) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %318 = "vector.extract"(%186) {position = [2]} : (vector<4xf32>) -> f32 | |
| %319 = "vector.splat"(%318) : (f32) -> vector<4xf32> | |
| %320 = "vector.fma"(%319, %253, %317) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %321 = "vector.extract"(%186) {position = [3]} : (vector<4xf32>) -> f32 | |
| %322 = "vector.splat"(%321) : (f32) -> vector<4xf32> | |
| %323 = "vector.fma"(%322, %255, %320) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %324 = "vector.extract"(%188) {position = [0]} : (vector<4xf32>) -> f32 | |
| %325 = "vector.splat"(%324) : (f32) -> vector<4xf32> | |
| %326 = "vector.fma"(%325, %257, %323) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %327 = "vector.extract"(%188) {position = [1]} : (vector<4xf32>) -> f32 | |
| %328 = "vector.splat"(%327) : (f32) -> vector<4xf32> | |
| %329 = "vector.fma"(%328, %259, %326) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %330 = "vector.extract"(%188) {position = [2]} : (vector<4xf32>) -> f32 | |
| %331 = "vector.splat"(%330) : (f32) -> vector<4xf32> | |
| %332 = "vector.fma"(%331, %261, %329) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %333 = "vector.extract"(%188) {position = [3]} : (vector<4xf32>) -> f32 | |
| %334 = "vector.splat"(%333) : (f32) -> vector<4xf32> | |
| %335 = "vector.fma"(%334, %263, %332) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %336 = "vector.extract"(%190) {position = [0]} : (vector<4xf32>) -> f32 | |
| %337 = "vector.splat"(%336) : (f32) -> vector<4xf32> | |
| %338 = "vector.fma"(%337, %265, %335) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %339 = "vector.extract"(%190) {position = [1]} : (vector<4xf32>) -> f32 | |
| %340 = "vector.splat"(%339) : (f32) -> vector<4xf32> | |
| %341 = "vector.fma"(%340, %267, %338) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %342 = "vector.extract"(%190) {position = [2]} : (vector<4xf32>) -> f32 | |
| %343 = "vector.splat"(%342) : (f32) -> vector<4xf32> | |
| %344 = "vector.fma"(%343, %269, %341) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %345 = "vector.extract"(%190) {position = [3]} : (vector<4xf32>) -> f32 | |
| %346 = "vector.splat"(%345) : (f32) -> vector<4xf32> | |
| %347 = "vector.fma"(%346, %271, %344) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %348 = "vector.extract"(%192) {position = [0]} : (vector<4xf32>) -> f32 | |
| %349 = "vector.splat"(%348) : (f32) -> vector<4xf32> | |
| %350 = "vector.fma"(%349, %273, %347) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %351 = "vector.extract"(%192) {position = [1]} : (vector<4xf32>) -> f32 | |
| %352 = "vector.splat"(%351) : (f32) -> vector<4xf32> | |
| %353 = "vector.fma"(%352, %275, %350) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %354 = "vector.extract"(%192) {position = [2]} : (vector<4xf32>) -> f32 | |
| %355 = "vector.splat"(%354) : (f32) -> vector<4xf32> | |
| %356 = "vector.fma"(%355, %277, %353) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %357 = "vector.extract"(%192) {position = [3]} : (vector<4xf32>) -> f32 | |
| %358 = "vector.splat"(%357) : (f32) -> vector<4xf32> | |
| %359 = "vector.fma"(%358, %279, %356) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %360 = "vector.extract"(%194) {position = [0]} : (vector<4xf32>) -> f32 | |
| %361 = "vector.splat"(%360) : (f32) -> vector<4xf32> | |
| %362 = "vector.fma"(%361, %281, %359) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %363 = "vector.extract"(%194) {position = [1]} : (vector<4xf32>) -> f32 | |
| %364 = "vector.splat"(%363) : (f32) -> vector<4xf32> | |
| %365 = "vector.fma"(%364, %283, %362) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %366 = "vector.extract"(%194) {position = [2]} : (vector<4xf32>) -> f32 | |
| %367 = "vector.splat"(%366) : (f32) -> vector<4xf32> | |
| %368 = "vector.fma"(%367, %285, %365) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %369 = "vector.extract"(%194) {position = [3]} : (vector<4xf32>) -> f32 | |
| %370 = "vector.splat"(%369) : (f32) -> vector<4xf32> | |
| %371 = "vector.fma"(%370, %287, %368) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %372 = "vector.extract"(%196) {position = [0]} : (vector<4xf32>) -> f32 | |
| %373 = "vector.splat"(%372) : (f32) -> vector<4xf32> | |
| %374 = "vector.fma"(%373, %289, %371) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %375 = "vector.extract"(%196) {position = [1]} : (vector<4xf32>) -> f32 | |
| %376 = "vector.splat"(%375) : (f32) -> vector<4xf32> | |
| %377 = "vector.fma"(%376, %291, %374) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %378 = "vector.extract"(%196) {position = [2]} : (vector<4xf32>) -> f32 | |
| %379 = "vector.splat"(%378) : (f32) -> vector<4xf32> | |
| %380 = "vector.fma"(%379, %293, %377) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %381 = "vector.extract"(%196) {position = [3]} : (vector<4xf32>) -> f32 | |
| %382 = "vector.splat"(%381) : (f32) -> vector<4xf32> | |
| %383 = "vector.fma"(%382, %295, %380) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %384 = "vector.extract"(%198) {position = [0]} : (vector<4xf32>) -> f32 | |
| %385 = "vector.splat"(%384) : (f32) -> vector<4xf32> | |
| %386 = "vector.fma"(%385, %297, %383) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %387 = "vector.extract"(%198) {position = [1]} : (vector<4xf32>) -> f32 | |
| %388 = "vector.splat"(%387) : (f32) -> vector<4xf32> | |
| %389 = "vector.fma"(%388, %299, %386) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %390 = "vector.extract"(%198) {position = [2]} : (vector<4xf32>) -> f32 | |
| %391 = "vector.splat"(%390) : (f32) -> vector<4xf32> | |
| %392 = "vector.fma"(%391, %301, %389) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %393 = "vector.extract"(%198) {position = [3]} : (vector<4xf32>) -> f32 | |
| %394 = "vector.splat"(%393) : (f32) -> vector<4xf32> | |
| %395 = "vector.fma"(%394, %303, %392) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %396 = "vector.extract"(%200) {position = [0]} : (vector<4xf32>) -> f32 | |
| %397 = "vector.splat"(%396) : (f32) -> vector<4xf32> | |
| %398 = "vector.fma"(%397, %305, %395) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %399 = "vector.extract"(%200) {position = [1]} : (vector<4xf32>) -> f32 | |
| %400 = "vector.splat"(%399) : (f32) -> vector<4xf32> | |
| %401 = "vector.fma"(%400, %307, %398) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %402 = "vector.extract"(%200) {position = [2]} : (vector<4xf32>) -> f32 | |
| %403 = "vector.splat"(%402) : (f32) -> vector<4xf32> | |
| %404 = "vector.fma"(%403, %309, %401) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %405 = "vector.extract"(%200) {position = [3]} : (vector<4xf32>) -> f32 | |
| %406 = "vector.splat"(%405) : (f32) -> vector<4xf32> | |
| %407 = "vector.fma"(%406, %311, %404) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %408 = "vector.extract"(%202) {position = [0]} : (vector<4xf32>) -> f32 | |
| %409 = "vector.splat"(%408) : (f32) -> vector<4xf32> | |
| %410 = "vector.fma"(%409, %249, %172#1) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %411 = "vector.extract"(%202) {position = [1]} : (vector<4xf32>) -> f32 | |
| %412 = "vector.splat"(%411) : (f32) -> vector<4xf32> | |
| %413 = "vector.fma"(%412, %251, %410) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %414 = "vector.extract"(%202) {position = [2]} : (vector<4xf32>) -> f32 | |
| %415 = "vector.splat"(%414) : (f32) -> vector<4xf32> | |
| %416 = "vector.fma"(%415, %253, %413) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %417 = "vector.extract"(%202) {position = [3]} : (vector<4xf32>) -> f32 | |
| %418 = "vector.splat"(%417) : (f32) -> vector<4xf32> | |
| %419 = "vector.fma"(%418, %255, %416) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %420 = "vector.extract"(%204) {position = [0]} : (vector<4xf32>) -> f32 | |
| %421 = "vector.splat"(%420) : (f32) -> vector<4xf32> | |
| %422 = "vector.fma"(%421, %257, %419) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %423 = "vector.extract"(%204) {position = [1]} : (vector<4xf32>) -> f32 | |
| %424 = "vector.splat"(%423) : (f32) -> vector<4xf32> | |
| %425 = "vector.fma"(%424, %259, %422) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %426 = "vector.extract"(%204) {position = [2]} : (vector<4xf32>) -> f32 | |
| %427 = "vector.splat"(%426) : (f32) -> vector<4xf32> | |
| %428 = "vector.fma"(%427, %261, %425) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %429 = "vector.extract"(%204) {position = [3]} : (vector<4xf32>) -> f32 | |
| %430 = "vector.splat"(%429) : (f32) -> vector<4xf32> | |
| %431 = "vector.fma"(%430, %263, %428) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %432 = "vector.extract"(%206) {position = [0]} : (vector<4xf32>) -> f32 | |
| %433 = "vector.splat"(%432) : (f32) -> vector<4xf32> | |
| %434 = "vector.fma"(%433, %265, %431) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %435 = "vector.extract"(%206) {position = [1]} : (vector<4xf32>) -> f32 | |
| %436 = "vector.splat"(%435) : (f32) -> vector<4xf32> | |
| %437 = "vector.fma"(%436, %267, %434) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %438 = "vector.extract"(%206) {position = [2]} : (vector<4xf32>) -> f32 | |
| %439 = "vector.splat"(%438) : (f32) -> vector<4xf32> | |
| %440 = "vector.fma"(%439, %269, %437) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %441 = "vector.extract"(%206) {position = [3]} : (vector<4xf32>) -> f32 | |
| %442 = "vector.splat"(%441) : (f32) -> vector<4xf32> | |
| %443 = "vector.fma"(%442, %271, %440) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %444 = "vector.extract"(%208) {position = [0]} : (vector<4xf32>) -> f32 | |
| %445 = "vector.splat"(%444) : (f32) -> vector<4xf32> | |
| %446 = "vector.fma"(%445, %273, %443) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %447 = "vector.extract"(%208) {position = [1]} : (vector<4xf32>) -> f32 | |
| %448 = "vector.splat"(%447) : (f32) -> vector<4xf32> | |
| %449 = "vector.fma"(%448, %275, %446) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %450 = "vector.extract"(%208) {position = [2]} : (vector<4xf32>) -> f32 | |
| %451 = "vector.splat"(%450) : (f32) -> vector<4xf32> | |
| %452 = "vector.fma"(%451, %277, %449) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %453 = "vector.extract"(%208) {position = [3]} : (vector<4xf32>) -> f32 | |
| %454 = "vector.splat"(%453) : (f32) -> vector<4xf32> | |
| %455 = "vector.fma"(%454, %279, %452) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %456 = "vector.extract"(%210) {position = [0]} : (vector<4xf32>) -> f32 | |
| %457 = "vector.splat"(%456) : (f32) -> vector<4xf32> | |
| %458 = "vector.fma"(%457, %281, %455) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %459 = "vector.extract"(%210) {position = [1]} : (vector<4xf32>) -> f32 | |
| %460 = "vector.splat"(%459) : (f32) -> vector<4xf32> | |
| %461 = "vector.fma"(%460, %283, %458) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %462 = "vector.extract"(%210) {position = [2]} : (vector<4xf32>) -> f32 | |
| %463 = "vector.splat"(%462) : (f32) -> vector<4xf32> | |
| %464 = "vector.fma"(%463, %285, %461) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %465 = "vector.extract"(%210) {position = [3]} : (vector<4xf32>) -> f32 | |
| %466 = "vector.splat"(%465) : (f32) -> vector<4xf32> | |
| %467 = "vector.fma"(%466, %287, %464) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %468 = "vector.extract"(%212) {position = [0]} : (vector<4xf32>) -> f32 | |
| %469 = "vector.splat"(%468) : (f32) -> vector<4xf32> | |
| %470 = "vector.fma"(%469, %289, %467) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %471 = "vector.extract"(%212) {position = [1]} : (vector<4xf32>) -> f32 | |
| %472 = "vector.splat"(%471) : (f32) -> vector<4xf32> | |
| %473 = "vector.fma"(%472, %291, %470) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %474 = "vector.extract"(%212) {position = [2]} : (vector<4xf32>) -> f32 | |
| %475 = "vector.splat"(%474) : (f32) -> vector<4xf32> | |
| %476 = "vector.fma"(%475, %293, %473) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %477 = "vector.extract"(%212) {position = [3]} : (vector<4xf32>) -> f32 | |
| %478 = "vector.splat"(%477) : (f32) -> vector<4xf32> | |
| %479 = "vector.fma"(%478, %295, %476) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %480 = "vector.extract"(%214) {position = [0]} : (vector<4xf32>) -> f32 | |
| %481 = "vector.splat"(%480) : (f32) -> vector<4xf32> | |
| %482 = "vector.fma"(%481, %297, %479) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %483 = "vector.extract"(%214) {position = [1]} : (vector<4xf32>) -> f32 | |
| %484 = "vector.splat"(%483) : (f32) -> vector<4xf32> | |
| %485 = "vector.fma"(%484, %299, %482) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %486 = "vector.extract"(%214) {position = [2]} : (vector<4xf32>) -> f32 | |
| %487 = "vector.splat"(%486) : (f32) -> vector<4xf32> | |
| %488 = "vector.fma"(%487, %301, %485) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %489 = "vector.extract"(%214) {position = [3]} : (vector<4xf32>) -> f32 | |
| %490 = "vector.splat"(%489) : (f32) -> vector<4xf32> | |
| %491 = "vector.fma"(%490, %303, %488) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %492 = "vector.extract"(%216) {position = [0]} : (vector<4xf32>) -> f32 | |
| %493 = "vector.splat"(%492) : (f32) -> vector<4xf32> | |
| %494 = "vector.fma"(%493, %305, %491) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %495 = "vector.extract"(%216) {position = [1]} : (vector<4xf32>) -> f32 | |
| %496 = "vector.splat"(%495) : (f32) -> vector<4xf32> | |
| %497 = "vector.fma"(%496, %307, %494) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %498 = "vector.extract"(%216) {position = [2]} : (vector<4xf32>) -> f32 | |
| %499 = "vector.splat"(%498) : (f32) -> vector<4xf32> | |
| %500 = "vector.fma"(%499, %309, %497) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %501 = "vector.extract"(%216) {position = [3]} : (vector<4xf32>) -> f32 | |
| %502 = "vector.splat"(%501) : (f32) -> vector<4xf32> | |
| %503 = "vector.fma"(%502, %311, %500) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %504 = "vector.extract"(%218) {position = [0]} : (vector<4xf32>) -> f32 | |
| %505 = "vector.splat"(%504) : (f32) -> vector<4xf32> | |
| %506 = "vector.fma"(%505, %249, %172#2) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %507 = "vector.extract"(%218) {position = [1]} : (vector<4xf32>) -> f32 | |
| %508 = "vector.splat"(%507) : (f32) -> vector<4xf32> | |
| %509 = "vector.fma"(%508, %251, %506) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %510 = "vector.extract"(%218) {position = [2]} : (vector<4xf32>) -> f32 | |
| %511 = "vector.splat"(%510) : (f32) -> vector<4xf32> | |
| %512 = "vector.fma"(%511, %253, %509) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %513 = "vector.extract"(%218) {position = [3]} : (vector<4xf32>) -> f32 | |
| %514 = "vector.splat"(%513) : (f32) -> vector<4xf32> | |
| %515 = "vector.fma"(%514, %255, %512) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %516 = "vector.extract"(%220) {position = [0]} : (vector<4xf32>) -> f32 | |
| %517 = "vector.splat"(%516) : (f32) -> vector<4xf32> | |
| %518 = "vector.fma"(%517, %257, %515) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %519 = "vector.extract"(%220) {position = [1]} : (vector<4xf32>) -> f32 | |
| %520 = "vector.splat"(%519) : (f32) -> vector<4xf32> | |
| %521 = "vector.fma"(%520, %259, %518) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %522 = "vector.extract"(%220) {position = [2]} : (vector<4xf32>) -> f32 | |
| %523 = "vector.splat"(%522) : (f32) -> vector<4xf32> | |
| %524 = "vector.fma"(%523, %261, %521) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %525 = "vector.extract"(%220) {position = [3]} : (vector<4xf32>) -> f32 | |
| %526 = "vector.splat"(%525) : (f32) -> vector<4xf32> | |
| %527 = "vector.fma"(%526, %263, %524) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %528 = "vector.extract"(%222) {position = [0]} : (vector<4xf32>) -> f32 | |
| %529 = "vector.splat"(%528) : (f32) -> vector<4xf32> | |
| %530 = "vector.fma"(%529, %265, %527) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %531 = "vector.extract"(%222) {position = [1]} : (vector<4xf32>) -> f32 | |
| %532 = "vector.splat"(%531) : (f32) -> vector<4xf32> | |
| %533 = "vector.fma"(%532, %267, %530) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %534 = "vector.extract"(%222) {position = [2]} : (vector<4xf32>) -> f32 | |
| %535 = "vector.splat"(%534) : (f32) -> vector<4xf32> | |
| %536 = "vector.fma"(%535, %269, %533) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %537 = "vector.extract"(%222) {position = [3]} : (vector<4xf32>) -> f32 | |
| %538 = "vector.splat"(%537) : (f32) -> vector<4xf32> | |
| %539 = "vector.fma"(%538, %271, %536) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %540 = "vector.extract"(%224) {position = [0]} : (vector<4xf32>) -> f32 | |
| %541 = "vector.splat"(%540) : (f32) -> vector<4xf32> | |
| %542 = "vector.fma"(%541, %273, %539) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %543 = "vector.extract"(%224) {position = [1]} : (vector<4xf32>) -> f32 | |
| %544 = "vector.splat"(%543) : (f32) -> vector<4xf32> | |
| %545 = "vector.fma"(%544, %275, %542) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %546 = "vector.extract"(%224) {position = [2]} : (vector<4xf32>) -> f32 | |
| %547 = "vector.splat"(%546) : (f32) -> vector<4xf32> | |
| %548 = "vector.fma"(%547, %277, %545) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %549 = "vector.extract"(%224) {position = [3]} : (vector<4xf32>) -> f32 | |
| %550 = "vector.splat"(%549) : (f32) -> vector<4xf32> | |
| %551 = "vector.fma"(%550, %279, %548) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %552 = "vector.extract"(%226) {position = [0]} : (vector<4xf32>) -> f32 | |
| %553 = "vector.splat"(%552) : (f32) -> vector<4xf32> | |
| %554 = "vector.fma"(%553, %281, %551) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %555 = "vector.extract"(%226) {position = [1]} : (vector<4xf32>) -> f32 | |
| %556 = "vector.splat"(%555) : (f32) -> vector<4xf32> | |
| %557 = "vector.fma"(%556, %283, %554) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %558 = "vector.extract"(%226) {position = [2]} : (vector<4xf32>) -> f32 | |
| %559 = "vector.splat"(%558) : (f32) -> vector<4xf32> | |
| %560 = "vector.fma"(%559, %285, %557) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %561 = "vector.extract"(%226) {position = [3]} : (vector<4xf32>) -> f32 | |
| %562 = "vector.splat"(%561) : (f32) -> vector<4xf32> | |
| %563 = "vector.fma"(%562, %287, %560) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %564 = "vector.extract"(%228) {position = [0]} : (vector<4xf32>) -> f32 | |
| %565 = "vector.splat"(%564) : (f32) -> vector<4xf32> | |
| %566 = "vector.fma"(%565, %289, %563) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %567 = "vector.extract"(%228) {position = [1]} : (vector<4xf32>) -> f32 | |
| %568 = "vector.splat"(%567) : (f32) -> vector<4xf32> | |
| %569 = "vector.fma"(%568, %291, %566) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %570 = "vector.extract"(%228) {position = [2]} : (vector<4xf32>) -> f32 | |
| %571 = "vector.splat"(%570) : (f32) -> vector<4xf32> | |
| %572 = "vector.fma"(%571, %293, %569) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %573 = "vector.extract"(%228) {position = [3]} : (vector<4xf32>) -> f32 | |
| %574 = "vector.splat"(%573) : (f32) -> vector<4xf32> | |
| %575 = "vector.fma"(%574, %295, %572) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %576 = "vector.extract"(%230) {position = [0]} : (vector<4xf32>) -> f32 | |
| %577 = "vector.splat"(%576) : (f32) -> vector<4xf32> | |
| %578 = "vector.fma"(%577, %297, %575) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %579 = "vector.extract"(%230) {position = [1]} : (vector<4xf32>) -> f32 | |
| %580 = "vector.splat"(%579) : (f32) -> vector<4xf32> | |
| %581 = "vector.fma"(%580, %299, %578) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %582 = "vector.extract"(%230) {position = [2]} : (vector<4xf32>) -> f32 | |
| %583 = "vector.splat"(%582) : (f32) -> vector<4xf32> | |
| %584 = "vector.fma"(%583, %301, %581) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %585 = "vector.extract"(%230) {position = [3]} : (vector<4xf32>) -> f32 | |
| %586 = "vector.splat"(%585) : (f32) -> vector<4xf32> | |
| %587 = "vector.fma"(%586, %303, %584) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %588 = "vector.extract"(%232) {position = [0]} : (vector<4xf32>) -> f32 | |
| %589 = "vector.splat"(%588) : (f32) -> vector<4xf32> | |
| %590 = "vector.fma"(%589, %305, %587) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %591 = "vector.extract"(%232) {position = [1]} : (vector<4xf32>) -> f32 | |
| %592 = "vector.splat"(%591) : (f32) -> vector<4xf32> | |
| %593 = "vector.fma"(%592, %307, %590) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %594 = "vector.extract"(%232) {position = [2]} : (vector<4xf32>) -> f32 | |
| %595 = "vector.splat"(%594) : (f32) -> vector<4xf32> | |
| %596 = "vector.fma"(%595, %309, %593) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %597 = "vector.extract"(%232) {position = [3]} : (vector<4xf32>) -> f32 | |
| %598 = "vector.splat"(%597) : (f32) -> vector<4xf32> | |
| %599 = "vector.fma"(%598, %311, %596) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %600 = "vector.extract"(%234) {position = [0]} : (vector<4xf32>) -> f32 | |
| %601 = "vector.splat"(%600) : (f32) -> vector<4xf32> | |
| %602 = "vector.fma"(%601, %249, %172#3) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %603 = "vector.extract"(%234) {position = [1]} : (vector<4xf32>) -> f32 | |
| %604 = "vector.splat"(%603) : (f32) -> vector<4xf32> | |
| %605 = "vector.fma"(%604, %251, %602) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %606 = "vector.extract"(%234) {position = [2]} : (vector<4xf32>) -> f32 | |
| %607 = "vector.splat"(%606) : (f32) -> vector<4xf32> | |
| %608 = "vector.fma"(%607, %253, %605) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %609 = "vector.extract"(%234) {position = [3]} : (vector<4xf32>) -> f32 | |
| %610 = "vector.splat"(%609) : (f32) -> vector<4xf32> | |
| %611 = "vector.fma"(%610, %255, %608) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %612 = "vector.extract"(%236) {position = [0]} : (vector<4xf32>) -> f32 | |
| %613 = "vector.splat"(%612) : (f32) -> vector<4xf32> | |
| %614 = "vector.fma"(%613, %257, %611) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %615 = "vector.extract"(%236) {position = [1]} : (vector<4xf32>) -> f32 | |
| %616 = "vector.splat"(%615) : (f32) -> vector<4xf32> | |
| %617 = "vector.fma"(%616, %259, %614) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %618 = "vector.extract"(%236) {position = [2]} : (vector<4xf32>) -> f32 | |
| %619 = "vector.splat"(%618) : (f32) -> vector<4xf32> | |
| %620 = "vector.fma"(%619, %261, %617) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %621 = "vector.extract"(%236) {position = [3]} : (vector<4xf32>) -> f32 | |
| %622 = "vector.splat"(%621) : (f32) -> vector<4xf32> | |
| %623 = "vector.fma"(%622, %263, %620) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %624 = "vector.extract"(%238) {position = [0]} : (vector<4xf32>) -> f32 | |
| %625 = "vector.splat"(%624) : (f32) -> vector<4xf32> | |
| %626 = "vector.fma"(%625, %265, %623) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %627 = "vector.extract"(%238) {position = [1]} : (vector<4xf32>) -> f32 | |
| %628 = "vector.splat"(%627) : (f32) -> vector<4xf32> | |
| %629 = "vector.fma"(%628, %267, %626) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %630 = "vector.extract"(%238) {position = [2]} : (vector<4xf32>) -> f32 | |
| %631 = "vector.splat"(%630) : (f32) -> vector<4xf32> | |
| %632 = "vector.fma"(%631, %269, %629) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %633 = "vector.extract"(%238) {position = [3]} : (vector<4xf32>) -> f32 | |
| %634 = "vector.splat"(%633) : (f32) -> vector<4xf32> | |
| %635 = "vector.fma"(%634, %271, %632) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %636 = "vector.extract"(%240) {position = [0]} : (vector<4xf32>) -> f32 | |
| %637 = "vector.splat"(%636) : (f32) -> vector<4xf32> | |
| %638 = "vector.fma"(%637, %273, %635) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %639 = "vector.extract"(%240) {position = [1]} : (vector<4xf32>) -> f32 | |
| %640 = "vector.splat"(%639) : (f32) -> vector<4xf32> | |
| %641 = "vector.fma"(%640, %275, %638) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %642 = "vector.extract"(%240) {position = [2]} : (vector<4xf32>) -> f32 | |
| %643 = "vector.splat"(%642) : (f32) -> vector<4xf32> | |
| %644 = "vector.fma"(%643, %277, %641) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %645 = "vector.extract"(%240) {position = [3]} : (vector<4xf32>) -> f32 | |
| %646 = "vector.splat"(%645) : (f32) -> vector<4xf32> | |
| %647 = "vector.fma"(%646, %279, %644) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %648 = "vector.extract"(%242) {position = [0]} : (vector<4xf32>) -> f32 | |
| %649 = "vector.splat"(%648) : (f32) -> vector<4xf32> | |
| %650 = "vector.fma"(%649, %281, %647) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %651 = "vector.extract"(%242) {position = [1]} : (vector<4xf32>) -> f32 | |
| %652 = "vector.splat"(%651) : (f32) -> vector<4xf32> | |
| %653 = "vector.fma"(%652, %283, %650) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %654 = "vector.extract"(%242) {position = [2]} : (vector<4xf32>) -> f32 | |
| %655 = "vector.splat"(%654) : (f32) -> vector<4xf32> | |
| %656 = "vector.fma"(%655, %285, %653) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %657 = "vector.extract"(%242) {position = [3]} : (vector<4xf32>) -> f32 | |
| %658 = "vector.splat"(%657) : (f32) -> vector<4xf32> | |
| %659 = "vector.fma"(%658, %287, %656) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %660 = "vector.extract"(%244) {position = [0]} : (vector<4xf32>) -> f32 | |
| %661 = "vector.splat"(%660) : (f32) -> vector<4xf32> | |
| %662 = "vector.fma"(%661, %289, %659) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %663 = "vector.extract"(%244) {position = [1]} : (vector<4xf32>) -> f32 | |
| %664 = "vector.splat"(%663) : (f32) -> vector<4xf32> | |
| %665 = "vector.fma"(%664, %291, %662) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %666 = "vector.extract"(%244) {position = [2]} : (vector<4xf32>) -> f32 | |
| %667 = "vector.splat"(%666) : (f32) -> vector<4xf32> | |
| %668 = "vector.fma"(%667, %293, %665) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %669 = "vector.extract"(%244) {position = [3]} : (vector<4xf32>) -> f32 | |
| %670 = "vector.splat"(%669) : (f32) -> vector<4xf32> | |
| %671 = "vector.fma"(%670, %295, %668) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %672 = "vector.extract"(%246) {position = [0]} : (vector<4xf32>) -> f32 | |
| %673 = "vector.splat"(%672) : (f32) -> vector<4xf32> | |
| %674 = "vector.fma"(%673, %297, %671) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %675 = "vector.extract"(%246) {position = [1]} : (vector<4xf32>) -> f32 | |
| %676 = "vector.splat"(%675) : (f32) -> vector<4xf32> | |
| %677 = "vector.fma"(%676, %299, %674) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %678 = "vector.extract"(%246) {position = [2]} : (vector<4xf32>) -> f32 | |
| %679 = "vector.splat"(%678) : (f32) -> vector<4xf32> | |
| %680 = "vector.fma"(%679, %301, %677) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %681 = "vector.extract"(%246) {position = [3]} : (vector<4xf32>) -> f32 | |
| %682 = "vector.splat"(%681) : (f32) -> vector<4xf32> | |
| %683 = "vector.fma"(%682, %303, %680) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %684 = "vector.extract"(%248) {position = [0]} : (vector<4xf32>) -> f32 | |
| %685 = "vector.splat"(%684) : (f32) -> vector<4xf32> | |
| %686 = "vector.fma"(%685, %305, %683) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %687 = "vector.extract"(%248) {position = [1]} : (vector<4xf32>) -> f32 | |
| %688 = "vector.splat"(%687) : (f32) -> vector<4xf32> | |
| %689 = "vector.fma"(%688, %307, %686) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %690 = "vector.extract"(%248) {position = [2]} : (vector<4xf32>) -> f32 | |
| %691 = "vector.splat"(%690) : (f32) -> vector<4xf32> | |
| %692 = "vector.fma"(%691, %309, %689) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| %693 = "vector.extract"(%248) {position = [3]} : (vector<4xf32>) -> f32 | |
| %694 = "vector.splat"(%693) : (f32) -> vector<4xf32> | |
| %695 = "vector.fma"(%694, %311, %692) : (vector<4xf32>, vector<4xf32>, vector<4xf32>) -> vector<4xf32> | |
| "memref.store"(%695, %109, %130) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%599, %110, %129) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%503, %111, %128) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "memref.store"(%407, %112, %127) : (vector<4xf32>, memref<?xvector<4xf32>, #spirv.storage_class<StorageBuffer>>, index) -> () | |
| "func.return"() : () -> () | |
| }) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 16, 1]>, sym_name = "forward_dispatch_35_matmul_18432x320x320"} : () -> () | |
| }) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
| "hal.executable.variant_end"() : () -> () | |
| }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
| "hal.executable_end"() : () -> () | |
| }) {sym_name = "forward_dispatch_35", sym_visibility = "private"} : () -> () | |
| %133 = linalg.matmul ins(%collapsed_749, %130 : tensor<18432x320xf32>, tensor<320x320xf32>) outs(%132 : tensor<18432x320xf32>) -> tensor<18432x320xf32> | |
| ^ | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment