pashu123 · February 9, 2023 04:54
diff --git a/err_dump.txt b/err_dump.txt
 /home/prashant/stable.mlir:765:15: error: failed to legalize operation 'arith.sitofp' that was explicitly marked illegal
      %3362 = arith.sitofp %3361 : i64 to f32
              ^
 /home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
 /home/prashant/stable.mlir:765:15: note: see current operation: %42 = "arith.sitofp"(%41) : (i64) -> f32
      %3362 = arith.sitofp %3361 : i64 to f32
              ^
 /home/prashant/stable.mlir:788:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>
    %9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
         ^
 /home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
 /home/prashant/stable.mlir:788:10: note: see current operation: 
 "hal.executable.variant"() ({
  "hal.executable.export"() ({
  ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
    %0 = "arith.constant"() {value = 5 : index} : () -> index
    %1 = "arith.constant"() {value = 1 : index} : () -> index
    "hal.return"(%0, %1, %1) : (index, index, index) -> ()
  }) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_0_generic_2x160", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 2 : index, 1 : index]} : () -> ()
  "builtin.module"() ({
    "func.func"() ({
      %0 = "arith.constant"() {value = 150560 : index} : () -> index
      %1 = "arith.constant"() {value = 160 : index} : () -> index
      %2 = "arith.constant"() {value = 32 : index} : () -> index
      %3 = "arith.constant"() {value = 0 : index} : () -> index
      %4 = "arith.constant"() {value = 320 : index} : () -> index
      %5 = "arith.constant"() {value = 0.693147182 : f32} : () -> f32
      %6 = "arith.constant"() {value = 1.44269502 : f32} : () -> f32
      %7 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
      %8 = "arith.constant"() {value = 0.499705136 : f32} : () -> f32
      %9 = "arith.constant"() {value = 0.168738902 : f32} : () -> f32
      %10 = "arith.constant"() {value = 0.0366896503 : f32} : () -> f32
      %11 = "arith.constant"() {value = 1.314350e-02 : f32} : () -> f32
      %12 = "arith.constant"() {value = 23 : i32} : () -> i32
      %13 = "arith.constant"() {value = 127 : i32} : () -> i32
      %14 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
      %15 = "arith.constant"() {value = 0x7F800000 : f32} : () -> f32
      %16 = "arith.constant"() {value = 0xFF800000 : f32} : () -> f32
      %17 = "arith.constant"() {value = 1.17549435E-38 : f32} : () -> f32
      %18 = "arith.constant"() {value = -127 : i32} : () -> i32
      %19 = "arith.constant"() {value = 602240 : index} : () -> index
      %20 = "arith.constant"() {value = -9.21033954 : f32} : () -> f32
      %21 = "arith.constant"() {value = 1.600000e+02 : f32} : () -> f32
      %22 = "arith.constant"() {value = 1 : index} : () -> index
      %23 = "hal.interface.binding.subspan"(%3, %22) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
      %24 = "hal.interface.binding.subspan"(%19, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
      %25 = "hal.interface.binding.subspan"(%3, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
      %26 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
      %27 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
      %28 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index
      %29 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index
      %30 = "memref.load"(%23, %3) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32
      %31 = "arith.muli"(%26, %2) : (index, index) -> index
      %32 = "arith.addi"(%31, %28) : (index, index) -> index
      %33 = "arith.index_cast"(%32) : (index) -> i64
      %34 = "arith.sitofp"(%33) : (i64) -> f32
      %35 = "arith.addf"(%34, %14) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %36 = "arith.mulf"(%35, %20) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %37 = "arith.divf"(%36, %21) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %38 = "arith.cmpf"(%37, %37) {predicate = 14 : i64} : (f32, f32) -> i1
      %39 = "arith.mulf"(%37, %6) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %40 = "math.floor"(%39) {fastmath = #arith.fastmath<none>} : (f32) -> f32
      %41 = "arith.mulf"(%40, %5) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %42 = "arith.subf"(%37, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %43 = "arith.mulf"(%42, %42) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %44 = "arith.mulf"(%43, %43) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %45 = "math.fma"(%7, %42, %7) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
      %46 = "math.fma"(%9, %42, %8) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
      %47 = "math.fma"(%11, %42, %10) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
      %48 = "math.fma"(%46, %43, %45) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
      %49 = "math.fma"(%47, %44, %48) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
      %50 = "arith.fptosi"(%40) : (f32) -> i32
      %51 = "arith.addi"(%50, %13) : (i32, i32) -> i32
      %52 = "arith.shli"(%51, %12) : (i32, i32) -> i32
      %53 = "arith.bitcast"(%52) : (i32) -> f32
      %54 = "arith.mulf"(%49, %53) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %55 = "arith.cmpi"(%50, %13) {predicate = 3 : i64} : (i32, i32) -> i1
      %56 = "arith.cmpi"(%50, %18) {predicate = 5 : i64} : (i32, i32) -> i1
      %57 = "arith.cmpf"(%37, %16) {predicate = 1 : i64} : (f32, f32) -> i1
      %58 = "arith.cmpf"(%37, %15) {predicate = 1 : i64} : (f32, f32) -> i1
      %59 = "arith.cmpf"(%37, %14) {predicate = 2 : i64} : (f32, f32) -> i1
      %60 = "arith.andi"(%55, %56) : (i1, i1) -> i1
      %61 = "arith.select"(%59, %15, %17) : (i1, f32, f32) -> f32
      %62 = "arith.select"(%60, %54, %61) : (i1, f32, f32) -> f32
      %63 = "arith.select"(%58, %15, %62) : (i1, f32, f32) -> f32
      %64 = "arith.select"(%57, %14, %63) : (i1, f32, f32) -> f32
      %65 = "arith.select"(%38, %37, %64) : (i1, f32, f32) -> f32
      %66 = "arith.mulf"(%30, %65) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
      %67 = "arith.muli"(%27, %4) : (index, index) -> index
      %68 = "arith.muli"(%29, %1) : (index, index) -> index
      %69 = "arith.addi"(%67, %68) : (index, index) -> index
      %70 = "arith.addi"(%69, %31) : (index, index) -> index
      %71 = "arith.addi"(%70, %28) : (index, index) -> index
      %72 = "arith.addi"(%71, %0) : (index, index) -> index
      "memref.store"(%66, %25, %72) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> ()
      "func.return"() : () -> ()
    }) {function_type = () -> (), sym_name = "forward_dispatch_0_generic_2x160"} : () -> ()
  }) : () -> ()
  "hal.executable.variant_end"() : () -> ()
 }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>} : () -> ()
    %9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
         ^
 /home/prashant/stable.mlir:788:10: error: failed to serialize executables
    %9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
         ^
 /home/prashant/stable.mlir:24:3: note: called from
  func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
  ^
 /home/prashant/stable.mlir:788:10: note: see current operation: 
 "hal.executable"() ({
  "hal.executable.variant"() ({
    "hal.executable.export"() ({
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
      %0 = "arith.constant"() {value = 5 : index} : () -> index
      %1 = "arith.constant"() {value = 1 : index} : () -> index
      "hal.return"(%0, %1, %1) : (index, index, index) -> ()
    }) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_0_generic_2x160", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 2 : index, 1 : index]} : () -> ()
    "builtin.module"() ({
      "func.func"() ({
        %0 = "arith.constant"() {value = 150560 : index} : () -> index
        %1 = "arith.constant"() {value = 160 : index} : () -> index
        %2 = "arith.constant"() {value = 32 : index} : () -> index
        %3 = "arith.constant"() {value = 0 : index} : () -> index
        %4 = "arith.constant"() {value = 320 : index} : () -> index
        %5 = "arith.constant"() {value = 0.693147182 : f32} : () -> f32
        %6 = "arith.constant"() {value = 1.44269502 : f32} : () -> f32
        %7 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
        %8 = "arith.constant"() {value = 0.499705136 : f32} : () -> f32
        %9 = "arith.constant"() {value = 0.168738902 : f32} : () -> f32
        %10 = "arith.constant"() {value = 0.0366896503 : f32} : () -> f32
        %11 = "arith.constant"() {value = 1.314350e-02 : f32} : () -> f32
        %12 = "arith.constant"() {value = 23 : i32} : () -> i32
        %13 = "arith.constant"() {value = 127 : i32} : () -> i32
        %14 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
        %15 = "arith.constant"() {value = 0x7F800000 : f32} : () -> f32
        %16 = "arith.constant"() {value = 0xFF800000 : f32} : () -> f32
        %17 = "arith.constant"() {value = 1.17549435E-38 : f32} : () -> f32
        %18 = "arith.constant"() {value = -127 : i32} : () -> i32
        %19 = "arith.constant"() {value = 602240 : index} : () -> index
        %20 = "arith.constant"() {value = -9.21033954 : f32} : () -> f32
        %21 = "arith.constant"() {value = 1.600000e+02 : f32} : () -> f32
        %22 = "arith.constant"() {value = 1 : index} : () -> index
        %23 = "hal.interface.binding.subspan"(%3, %22) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
        %24 = "hal.interface.binding.subspan"(%19, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
        %25 = "hal.interface.binding.subspan"(%3, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
        %26 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
        %27 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
        %28 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index
        %29 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index
        %30 = "memref.load"(%23, %3) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32
        %31 = "arith.muli"(%26, %2) : (index, index) -> index
        %32 = "arith.addi"(%31, %28) : (index, index) -> index
        %33 = "arith.index_cast"(%32) : (index) -> i64
        %34 = "arith.sitofp"(%33) : (i64) -> f32
        %35 = "arith.addf"(%34, %14) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %36 = "arith.mulf"(%35, %20) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %37 = "arith.divf"(%36, %21) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %38 = "arith.cmpf"(%37, %37) {predicate = 14 : i64} : (f32, f32) -> i1
        %39 = "arith.mulf"(%37, %6) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %40 = "math.floor"(%39) {fastmath = #arith.fastmath<none>} : (f32) -> f32
        %41 = "arith.mulf"(%40, %5) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %42 = "arith.subf"(%37, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %43 = "arith.mulf"(%42, %42) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %44 = "arith.mulf"(%43, %43) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %45 = "math.fma"(%7, %42, %7) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
        %46 = "math.fma"(%9, %42, %8) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
        %47 = "math.fma"(%11, %42, %10) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
        %48 = "math.fma"(%46, %43, %45) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
        %49 = "math.fma"(%47, %44, %48) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
        %50 = "arith.fptosi"(%40) : (f32) -> i32
        %51 = "arith.addi"(%50, %13) : (i32, i32) -> i32
        %52 = "arith.shli"(%51, %12) : (i32, i32) -> i32
        %53 = "arith.bitcast"(%52) : (i32) -> f32
        %54 = "arith.mulf"(%49, %53) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %55 = "arith.cmpi"(%50, %13) {predicate = 3 : i64} : (i32, i32) -> i1
        %56 = "arith.cmpi"(%50, %18) {predicate = 5 : i64} : (i32, i32) -> i1
        %57 = "arith.cmpf"(%37, %16) {predicate = 1 : i64} : (f32, f32) -> i1
        %58 = "arith.cmpf"(%37, %15) {predicate = 1 : i64} : (f32, f32) -> i1
        %59 = "arith.cmpf"(%37, %14) {predicate = 2 : i64} : (f32, f32) -> i1
        %60 = "arith.andi"(%55, %56) : (i1, i1) -> i1
        %61 = "arith.select"(%59, %15, %17) : (i1, f32, f32) -> f32
        %62 = "arith.select"(%60, %54, %61) : (i1, f32, f32) -> f32
        %63 = "arith.select"(%58, %15, %62) : (i1, f32, f32) -> f32
        %64 = "arith.select"(%57, %14, %63) : (i1, f32, f32) -> f32
        %65 = "arith.select"(%38, %37, %64) : (i1, f32, f32) -> f32
        %66 = "arith.mulf"(%30, %65) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
        %67 = "arith.muli"(%27, %4) : (index, index) -> index
        %68 = "arith.muli"(%29, %1) : (index, index) -> index
        %69 = "arith.addi"(%67, %68) : (index, index) -> index
        %70 = "arith.addi"(%69, %31) : (index, index) -> index
        %71 = "arith.addi"(%70, %28) : (index, index) -> index
        %72 = "arith.addi"(%71, %0) : (index, index) -> index
        "memref.store"(%66, %25, %72) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> ()
        "func.return"() : () -> ()
      }) {function_type = () -> (), sym_name = "forward_dispatch_0_generic_2x160"} : () -> ()
    }) : () -> ()
    "hal.executable.variant_end"() : () -> ()
  }) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>} : () -> ()
  "hal.executable_end"() : () -> ()
 }) {sym_name = "forward_dispatch_0", sym_visibility = "private"} : () -> ()
    %9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
         ^
	/home/prashant/stable.mlir:765:15: error: failed to legalize operation 'arith.sitofp' that was explicitly marked illegal
	%3362 = arith.sitofp %3361 : i64 to f32
	^
	/home/prashant/stable.mlir:24:3: note: called from
	func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
	^
	/home/prashant/stable.mlir:765:15: note: see current operation: %42 = "arith.sitofp"(%41) : (i64) -> f32
	%3362 = arith.sitofp %3361 : i64 to f32
	^
	/home/prashant/stable.mlir:788:10: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>
	%9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
	^
	/home/prashant/stable.mlir:24:3: note: called from
	func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
	^
	/home/prashant/stable.mlir:788:10: note: see current operation:
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
	%0 = "arith.constant"() {value = 5 : index} : () -> index
	%1 = "arith.constant"() {value = 1 : index} : () -> index
	"hal.return"(%0, %1, %1) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_0_generic_2x160", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 2 : index, 1 : index]} : () -> ()
	"builtin.module"() ({
	"func.func"() ({
	%0 = "arith.constant"() {value = 150560 : index} : () -> index
	%1 = "arith.constant"() {value = 160 : index} : () -> index
	%2 = "arith.constant"() {value = 32 : index} : () -> index
	%3 = "arith.constant"() {value = 0 : index} : () -> index
	%4 = "arith.constant"() {value = 320 : index} : () -> index
	%5 = "arith.constant"() {value = 0.693147182 : f32} : () -> f32
	%6 = "arith.constant"() {value = 1.44269502 : f32} : () -> f32
	%7 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
	%8 = "arith.constant"() {value = 0.499705136 : f32} : () -> f32
	%9 = "arith.constant"() {value = 0.168738902 : f32} : () -> f32
	%10 = "arith.constant"() {value = 0.0366896503 : f32} : () -> f32
	%11 = "arith.constant"() {value = 1.314350e-02 : f32} : () -> f32
	%12 = "arith.constant"() {value = 23 : i32} : () -> i32
	%13 = "arith.constant"() {value = 127 : i32} : () -> i32
	%14 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
	%15 = "arith.constant"() {value = 0x7F800000 : f32} : () -> f32
	%16 = "arith.constant"() {value = 0xFF800000 : f32} : () -> f32
	%17 = "arith.constant"() {value = 1.17549435E-38 : f32} : () -> f32
	%18 = "arith.constant"() {value = -127 : i32} : () -> i32
	%19 = "arith.constant"() {value = 602240 : index} : () -> index
	%20 = "arith.constant"() {value = -9.21033954 : f32} : () -> f32
	%21 = "arith.constant"() {value = 1.600000e+02 : f32} : () -> f32
	%22 = "arith.constant"() {value = 1 : index} : () -> index
	%23 = "hal.interface.binding.subspan"(%3, %22) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%24 = "hal.interface.binding.subspan"(%19, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%25 = "hal.interface.binding.subspan"(%3, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%26 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
	%27 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
	%28 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index
	%29 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index
	%30 = "memref.load"(%23, %3) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32
	%31 = "arith.muli"(%26, %2) : (index, index) -> index
	%32 = "arith.addi"(%31, %28) : (index, index) -> index
	%33 = "arith.index_cast"(%32) : (index) -> i64
	%34 = "arith.sitofp"(%33) : (i64) -> f32
	%35 = "arith.addf"(%34, %14) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%36 = "arith.mulf"(%35, %20) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%37 = "arith.divf"(%36, %21) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%38 = "arith.cmpf"(%37, %37) {predicate = 14 : i64} : (f32, f32) -> i1
	%39 = "arith.mulf"(%37, %6) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%40 = "math.floor"(%39) {fastmath = #arith.fastmath<none>} : (f32) -> f32
	%41 = "arith.mulf"(%40, %5) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%42 = "arith.subf"(%37, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%43 = "arith.mulf"(%42, %42) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%44 = "arith.mulf"(%43, %43) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%45 = "math.fma"(%7, %42, %7) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%46 = "math.fma"(%9, %42, %8) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%47 = "math.fma"(%11, %42, %10) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%48 = "math.fma"(%46, %43, %45) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%49 = "math.fma"(%47, %44, %48) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%50 = "arith.fptosi"(%40) : (f32) -> i32
	%51 = "arith.addi"(%50, %13) : (i32, i32) -> i32
	%52 = "arith.shli"(%51, %12) : (i32, i32) -> i32
	%53 = "arith.bitcast"(%52) : (i32) -> f32
	%54 = "arith.mulf"(%49, %53) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%55 = "arith.cmpi"(%50, %13) {predicate = 3 : i64} : (i32, i32) -> i1
	%56 = "arith.cmpi"(%50, %18) {predicate = 5 : i64} : (i32, i32) -> i1
	%57 = "arith.cmpf"(%37, %16) {predicate = 1 : i64} : (f32, f32) -> i1
	%58 = "arith.cmpf"(%37, %15) {predicate = 1 : i64} : (f32, f32) -> i1
	%59 = "arith.cmpf"(%37, %14) {predicate = 2 : i64} : (f32, f32) -> i1
	%60 = "arith.andi"(%55, %56) : (i1, i1) -> i1
	%61 = "arith.select"(%59, %15, %17) : (i1, f32, f32) -> f32
	%62 = "arith.select"(%60, %54, %61) : (i1, f32, f32) -> f32
	%63 = "arith.select"(%58, %15, %62) : (i1, f32, f32) -> f32
	%64 = "arith.select"(%57, %14, %63) : (i1, f32, f32) -> f32
	%65 = "arith.select"(%38, %37, %64) : (i1, f32, f32) -> f32
	%66 = "arith.mulf"(%30, %65) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%67 = "arith.muli"(%27, %4) : (index, index) -> index
	%68 = "arith.muli"(%29, %1) : (index, index) -> index
	%69 = "arith.addi"(%67, %68) : (index, index) -> index
	%70 = "arith.addi"(%69, %31) : (index, index) -> index
	%71 = "arith.addi"(%70, %28) : (index, index) -> index
	%72 = "arith.addi"(%71, %0) : (index, index) -> index
	"memref.store"(%66, %25, %72) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> ()
	"func.return"() : () -> ()
	}) {function_type = () -> (), sym_name = "forward_dispatch_0_generic_2x160"} : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>} : () -> ()
	%9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
	^
	/home/prashant/stable.mlir:788:10: error: failed to serialize executables
	%9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
	^
	/home/prashant/stable.mlir:24:3: note: called from
	func.func @forward(%arg0: tensor<1x4x96x96xf32>, %arg1: tensor<1xf32>, %arg2: tensor<2x64x1024xf32>, %arg3: tensor<f32>) -> tensor<1x4x96x96xf32> {
	^
	/home/prashant/stable.mlir:788:10: note: see current operation:
	"hal.executable"() ({
	"hal.executable.variant"() ({
	"hal.executable.export"() ({
	^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
	%0 = "arith.constant"() {value = 5 : index} : () -> index
	%1 = "arith.constant"() {value = 1 : index} : () -> index
	"hal.return"(%0, %1, %1) : (index, index, index) -> ()
	}) {layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_0_generic_2x160", translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute>, workgroup_size = [32 : index, 2 : index, 1 : index]} : () -> ()
	"builtin.module"() ({
	"func.func"() ({
	%0 = "arith.constant"() {value = 150560 : index} : () -> index
	%1 = "arith.constant"() {value = 160 : index} : () -> index
	%2 = "arith.constant"() {value = 32 : index} : () -> index
	%3 = "arith.constant"() {value = 0 : index} : () -> index
	%4 = "arith.constant"() {value = 320 : index} : () -> index
	%5 = "arith.constant"() {value = 0.693147182 : f32} : () -> f32
	%6 = "arith.constant"() {value = 1.44269502 : f32} : () -> f32
	%7 = "arith.constant"() {value = 1.000000e+00 : f32} : () -> f32
	%8 = "arith.constant"() {value = 0.499705136 : f32} : () -> f32
	%9 = "arith.constant"() {value = 0.168738902 : f32} : () -> f32
	%10 = "arith.constant"() {value = 0.0366896503 : f32} : () -> f32
	%11 = "arith.constant"() {value = 1.314350e-02 : f32} : () -> f32
	%12 = "arith.constant"() {value = 23 : i32} : () -> i32
	%13 = "arith.constant"() {value = 127 : i32} : () -> i32
	%14 = "arith.constant"() {value = 0.000000e+00 : f32} : () -> f32
	%15 = "arith.constant"() {value = 0x7F800000 : f32} : () -> f32
	%16 = "arith.constant"() {value = 0xFF800000 : f32} : () -> f32
	%17 = "arith.constant"() {value = 1.17549435E-38 : f32} : () -> f32
	%18 = "arith.constant"() {value = -127 : i32} : () -> i32
	%19 = "arith.constant"() {value = 602240 : index} : () -> index
	%20 = "arith.constant"() {value = -9.21033954 : f32} : () -> f32
	%21 = "arith.constant"() {value = 1.600000e+02 : f32} : () -> f32
	%22 = "arith.constant"() {value = 1 : index} : () -> index
	%23 = "hal.interface.binding.subspan"(%3, %22) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 1 : i32, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%24 = "hal.interface.binding.subspan"(%19, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%25 = "hal.interface.binding.subspan"(%3, %4) {alignment = 64 : index, binding = 1 : index, descriptor_type = #hal.descriptor_type<storage_buffer>, operand_segment_sizes = array<i32: 1, 1>, set = 0 : index} : (index, index) -> memref<?xf32, #spirv.storage_class<StorageBuffer>>
	%26 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
	%27 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
	%28 = "gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index
	%29 = "gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index
	%30 = "memref.load"(%23, %3) {nontemporal = false} : (memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> f32
	%31 = "arith.muli"(%26, %2) : (index, index) -> index
	%32 = "arith.addi"(%31, %28) : (index, index) -> index
	%33 = "arith.index_cast"(%32) : (index) -> i64
	%34 = "arith.sitofp"(%33) : (i64) -> f32
	%35 = "arith.addf"(%34, %14) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%36 = "arith.mulf"(%35, %20) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%37 = "arith.divf"(%36, %21) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%38 = "arith.cmpf"(%37, %37) {predicate = 14 : i64} : (f32, f32) -> i1
	%39 = "arith.mulf"(%37, %6) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%40 = "math.floor"(%39) {fastmath = #arith.fastmath<none>} : (f32) -> f32
	%41 = "arith.mulf"(%40, %5) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%42 = "arith.subf"(%37, %41) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%43 = "arith.mulf"(%42, %42) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%44 = "arith.mulf"(%43, %43) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%45 = "math.fma"(%7, %42, %7) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%46 = "math.fma"(%9, %42, %8) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%47 = "math.fma"(%11, %42, %10) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%48 = "math.fma"(%46, %43, %45) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%49 = "math.fma"(%47, %44, %48) {fastmath = #arith.fastmath<none>} : (f32, f32, f32) -> f32
	%50 = "arith.fptosi"(%40) : (f32) -> i32
	%51 = "arith.addi"(%50, %13) : (i32, i32) -> i32
	%52 = "arith.shli"(%51, %12) : (i32, i32) -> i32
	%53 = "arith.bitcast"(%52) : (i32) -> f32
	%54 = "arith.mulf"(%49, %53) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%55 = "arith.cmpi"(%50, %13) {predicate = 3 : i64} : (i32, i32) -> i1
	%56 = "arith.cmpi"(%50, %18) {predicate = 5 : i64} : (i32, i32) -> i1
	%57 = "arith.cmpf"(%37, %16) {predicate = 1 : i64} : (f32, f32) -> i1
	%58 = "arith.cmpf"(%37, %15) {predicate = 1 : i64} : (f32, f32) -> i1
	%59 = "arith.cmpf"(%37, %14) {predicate = 2 : i64} : (f32, f32) -> i1
	%60 = "arith.andi"(%55, %56) : (i1, i1) -> i1
	%61 = "arith.select"(%59, %15, %17) : (i1, f32, f32) -> f32
	%62 = "arith.select"(%60, %54, %61) : (i1, f32, f32) -> f32
	%63 = "arith.select"(%58, %15, %62) : (i1, f32, f32) -> f32
	%64 = "arith.select"(%57, %14, %63) : (i1, f32, f32) -> f32
	%65 = "arith.select"(%38, %37, %64) : (i1, f32, f32) -> f32
	%66 = "arith.mulf"(%30, %65) {fastmath = #arith.fastmath<none>} : (f32, f32) -> f32
	%67 = "arith.muli"(%27, %4) : (index, index) -> index
	%68 = "arith.muli"(%29, %1) : (index, index) -> index
	%69 = "arith.addi"(%67, %68) : (index, index) -> index
	%70 = "arith.addi"(%69, %31) : (index, index) -> index
	%71 = "arith.addi"(%70, %28) : (index, index) -> index
	%72 = "arith.addi"(%71, %0) : (index, index) -> index
	"memref.store"(%66, %25, %72) : (f32, memref<?xf32, #spirv.storage_class<StorageBuffer>>, index) -> ()
	"func.return"() : () -> ()
	}) {function_type = () -> (), sym_name = "forward_dispatch_0_generic_2x160"} : () -> ()
	}) : () -> ()
	"hal.executable.variant_end"() : () -> ()
	}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>} : () -> ()
	"hal.executable_end"() : () -> ()
	}) {sym_name = "forward_dispatch_0", sym_visibility = "private"} : () -> ()
	%9 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%expanded, %expanded_723 : tensor<2x1xf32>, tensor<1x160xf32>) outs(%8 : tensor<2x160xf32>) {
	^
No results found