Created
February 10, 2023 15:33
-
-
Save pashu123/2d3236c4ec681e5e56a393e62636a34c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/home/prashant/stable.mlir:793:11: error: failed to materialize conversion for result #0 of operation 'hal.interface.constant.load' that remained live after conversion
  %10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) {
          ^
/home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
/home/prashant/stable.mlir:793:11: note: see current operation: %20 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32
  %10 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x160xf32>) outs(%8 : tensor<2x160xf32>) {
          ^
/home/prashant/stable.mlir:793:11: note: see existing live user here: %28 = "spirv.UConvert"(%20) : (i32) -> i64
/home/prashant/stable.mlir:804:27: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>
  %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32>
                          ^
/home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
/home/prashant/stable.mlir:804:27: note: see current operation: | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 2 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 4 : index} : () -> index | |
%2 = "arith.constant"() {value = 32 : index} : () -> index | |
%3 = "arith.constant"() {value = 160 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 640 : index} : () -> index | |
%6 = "arith.constant"() {value = 320 : index} : () -> index | |
%7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%9 = "arith.index_castui"(%7) : (i32) -> index | |
%10 = "arith.index_castui"(%8) : (i32) -> index | |
%11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%19 = "arith.muli"(%16, %3) : (index, index) -> index | |
%20 = "arith.muli"(%18, %3) : (index, index) -> index | |
%21 = "arith.addi"(%19, %20) : (index, index) -> index | |
%22 = "arith.muli"(%15, %2) : (index, index) -> index | |
%23 = "arith.addi"(%21, %22) : (index, index) -> index | |
%24 = "arith.addi"(%23, %17) : (index, index) -> index | |
%25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%26 = "arith.subi"(%0, %9) : (index, index) -> index | |
%27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
%28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
%29 = "arith.subi"(%0, %28) : (index, index) -> index | |
%30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
%31 = "arith.addi"(%24, %30) : (index, index) -> index | |
%32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%33 = "arith.muli"(%16, %6) : (index, index) -> index | |
%34 = "arith.muli"(%18, %6) : (index, index) -> index | |
%35 = "arith.addi"(%33, %34) : (index, index) -> index | |
%36 = "arith.addi"(%35, %22) : (index, index) -> index | |
%37 = "arith.addi"(%36, %17) : (index, index) -> index | |
%38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%39 = "arith.subi"(%0, %10) : (index, index) -> index | |
%40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
%41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
%42 = "arith.subi"(%0, %41) : (index, index) -> index | |
%43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
%44 = "arith.addi"(%37, %43) : (index, index) -> index | |
"memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
%inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32> | |
^ | |
/home/prashant/stable.mlir:804:27: error: failed to serialize executables
  %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32>
                          ^
/home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
/home/prashant/stable.mlir:804:27: note: see current operation: | |
"hal.executable"() ({ | |
"hal.executable.variant"() ({ | |
"hal.executable.export"() ({ | |
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): | |
%0 = "arith.constant"() {value = 5 : index} : () -> index | |
%1 = "arith.constant"() {value = 2 : index} : () -> index | |
%2 = "arith.constant"() {value = 1 : index} : () -> index | |
"hal.return"(%0, %1, %2) : (index, index, index) -> () | |
}) {layout = #hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_3", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () | |
"builtin.module"() ({ | |
"spirv.GlobalVariable"() {binding = 0 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_0_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"spirv.GlobalVariable"() {binding = 1 : i32, descriptor_set = 0 : i32, sym_name = "__resource_var_0_1_", type = !spirv.ptr<!spirv.struct<(!spirv.rtarray<f32, stride=4> [0])>, StorageBuffer>} : () -> () | |
"func.func"() ({ | |
%0 = "arith.constant"() {value = -1 : index} : () -> index | |
%1 = "arith.constant"() {value = 4 : index} : () -> index | |
%2 = "arith.constant"() {value = 32 : index} : () -> index | |
%3 = "arith.constant"() {value = 160 : index} : () -> index | |
%4 = "arith.constant"() {value = 0 : index} : () -> index | |
%5 = "arith.constant"() {value = 640 : index} : () -> index | |
%6 = "arith.constant"() {value = 320 : index} : () -> index | |
%7 = "hal.interface.constant.load"() {index = 0 : index} : () -> i32 | |
%8 = "hal.interface.constant.load"() {index = 1 : index} : () -> i32 | |
%9 = "arith.index_castui"(%7) : (i32) -> index | |
%10 = "arith.index_castui"(%8) : (i32) -> index | |
%11 = "hal.interface.binding.subspan"(%9, %6) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%12 = "hal.interface.binding.subspan"(%4, %6) {alignment = 64 : index, binding = 0 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%13 = "hal.interface.binding.subspan"(%10, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%14 = "hal.interface.binding.subspan"(%4, %5) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>> | |
%15 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index | |
%16 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index | |
%17 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index | |
%18 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index | |
%19 = "arith.muli"(%16, %3) : (index, index) -> index | |
%20 = "arith.muli"(%18, %3) : (index, index) -> index | |
%21 = "arith.addi"(%19, %20) : (index, index) -> index | |
%22 = "arith.muli"(%15, %2) : (index, index) -> index | |
%23 = "arith.addi"(%21, %22) : (index, index) -> index | |
%24 = "arith.addi"(%23, %17) : (index, index) -> index | |
%25 = "arith.cmpi"(%9, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%26 = "arith.subi"(%0, %9) : (index, index) -> index | |
%27 = "arith.select"(%25, %26, %9) : (i1, index, index) -> index | |
%28 = "arith.divsi"(%27, %1) : (index, index) -> index | |
%29 = "arith.subi"(%0, %28) : (index, index) -> index | |
%30 = "arith.select"(%25, %29, %28) : (i1, index, index) -> index | |
%31 = "arith.addi"(%24, %30) : (index, index) -> index | |
%32 = "memref.load"(%12, %31) : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32 | |
%33 = "arith.muli"(%16, %6) : (index, index) -> index | |
%34 = "arith.muli"(%18, %6) : (index, index) -> index | |
%35 = "arith.addi"(%33, %34) : (index, index) -> index | |
%36 = "arith.addi"(%35, %22) : (index, index) -> index | |
%37 = "arith.addi"(%36, %17) : (index, index) -> index | |
%38 = "arith.cmpi"(%10, %4) {predicate = 2 : i64} : (index, index) -> i1 | |
%39 = "arith.subi"(%0, %10) : (index, index) -> index | |
%40 = "arith.select"(%38, %39, %10) : (i1, index, index) -> index | |
%41 = "arith.divsi"(%40, %1) : (index, index) -> index | |
%42 = "arith.subi"(%0, %41) : (index, index) -> index | |
%43 = "arith.select"(%38, %42, %41) : (i1, index, index) -> index | |
%44 = "arith.addi"(%37, %43) : (index, index) -> index | |
"memref.store"(%32, %14, %44) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> () | |
"func.return"() : () -> () | |
}) {function_type = () -> (), spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [32, 1, 1]>, sym_name = "forward_dispatch_3"} : () -> () | |
}) {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>} : () -> () | |
"hal.executable.variant_end"() : () -> () | |
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () | |
"hal.executable_end"() : () -> () | |
}) {sym_name = "forward_dispatch_3", sym_visibility = "private"} : () -> () | |
  %inserted_slice_724 = tensor.insert_slice %10 into %12[0, 0] [2, 160] [1, 1] : tensor<2x160xf32> into tensor<2x320xf32>
                          ^
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment