Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save AmosLewis/d171b35532b67a44a0a5d0a8a20504d1 to your computer and use it in GitHub Desktop.
// ---- Target and affine-map declarations for this IREE flow-level module ----
// LLVM-CPU executable target: embedded-ELF x86-64, tuned for "haswell"
// (AVX2/FMA enabled, no AVX-512), 32-byte native vectors.
// NOTE(review): the `ukernels` attribute carries the string
// "--iree-llvmcpu-stack-allocation-limit=256000", which looks like a CLI flag
// pasted into the wrong attribute (ukernels normally selects microkernels) —
// confirm against the compiler invocation that produced this dump.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "haswell", cpu_features = "-prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-xsaves,-avx512fp16,-usermsr,-sm4,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,-xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,-adx,+avx2,-hreset,-movdiri,-serialize,-vpclmulqdq,-avx512vl,-uintr,-clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,-sha,-movdir64b,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,-rdpru,-clwb,+mmx,+sse2,-rdseed,-avx512vbmi2,-prefetchi,-rdpid,-fma4,-avx512vbmi,-shstk,-vaes,-waitpkg,-sgx,+fxsr,-avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "--iree-llvmcpu-stack-allocation-limit=256000"}>
// 1-D identity map: element i reads/writes position i.
#map = affine_map<(d0) -> (d0)>
// 2-D iteration space, operand indexed by the row dimension only (broadcast
// along columns).
#map1 = affine_map<(d0, d1) -> (d0)>
// 2-D iteration space, operand indexed by the column dimension only
// (broadcast along rows).
#map2 = affine_map<(d0, d1) -> (d1)>
// 2-D identity map.
#map3 = affine_map<(d0, d1) -> (d0, d1)>
// 0-D (scalar tensor) map.
#map4 = affine_map<() -> ()>
// Scalar operand broadcast over a 2-D iteration space.
#map5 = affine_map<(d0, d1) -> ()>
// Single-device target wrapping the CPU executable target above.
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
module attributes {hal.device.targets = [#device_target_llvm_cpu], iree.consteval} {
// Dispatch: produce a 32-element f16 tensor where
//   out[i] = (i * 2.0 + 0.0) / 64.0   (i.e. i / 32).
// No inputs; the result is written to the write-only output binding.
flow.executable private @jit_eval_dispatch_0 {
// Workgroup count is derived from the dispatch slice (data-independent here).
flow.executable.export public @jit_eval_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32xf16>>) {
%cst = arith.constant 2.000000e+00 : f16
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 6.400000e+01 : f16
%0 = tensor.empty() : tensor<32xf16>
// Output-only generic: each lane computes from its own index.
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<32xf16>) {
^bb0(%out: f16):
// Current iteration index i, converted i64 -> f16.
%2 = linalg.index 0 : index
%3 = arith.index_cast %2 : index to i64
%4 = arith.sitofp %3 : i64 to f16
// (i * 2 + 0) / 64
%5 = arith.mulf %4, %cst : f16
%6 = arith.addf %5, %cst_0 : f16
%7 = arith.divf %6, %cst_1 : f16
linalg.yield %7 : f16
} -> tensor<32xf16>
flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>>
return
}
}
}
// Public entry point: runs @jit_eval_dispatch_0 (produces i/32 for i in
// 0..31 as f16) and exports the result as a HAL buffer view.
func.func @jit_eval() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval() -> (%output0: tensor<32xf16>)"}} {
%0 = flow.dispatch @jit_eval_dispatch_0::@jit_eval_dispatch_0_generic_32_f16() : () -> tensor<32xf16>
%1 = hal.tensor.export %0 "output0" : tensor<32xf16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: elementwise out[i] = 1.0 / (10000.0 ^ in[i]) over 32 f16 values,
// with a runtime assert that the power is nonzero before dividing.
// NOTE(review): 1/10000^x is the classic inverse-frequency formula used by
// rotary/sinusoidal position embeddings — presumably that is the intent here;
// confirm against the producing frontend.
flow.executable private @jit_eval_0_dispatch_0 {
flow.executable.export public @jit_eval_0_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_0_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<32xf16>>) {
%cst = arith.constant 1.000000e+04 : f16
%cst_0 = arith.constant 0.000000e+00 : f16
%cst_1 = arith.constant 1.000000e+00 : f16
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16>
%1 = tensor.empty() : tensor<32xf16>
%2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%0 : tensor<32xf16>) outs(%1 : tensor<32xf16>) {
^bb0(%in: f16, %out: f16):
// 10000 ^ in
%3 = math.powf %cst, %in : f16
// Guard against division by zero ("one" = ordered not-equal).
%4 = arith.cmpf one, %3, %cst_0 : f16
cf.assert %4, "unimplemented: tensor with zero element"
// 1 / (10000 ^ in)
%5 = arith.divf %cst_1, %3 : f16
linalg.yield %5 : f16
} -> tensor<32xf16>
flow.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>>
return
}
}
}
// Public entry point: imports a 32xf16 tensor, applies the 1/10000^x dispatch
// above, and exports the result.
func.func @jit_eval_0(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_0(%input0: tensor<32xf16>) -> (%output0: tensor<32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xf16>
%1 = flow.dispatch @jit_eval_0_dispatch_0::@jit_eval_0_dispatch_0_generic_32_f16(%0) : (tensor<32xf16>) -> tensor<32xf16>
%2 = hal.tensor.export %1 "output0" : tensor<32xf16> -> !hal.buffer_view
return %2 : !hal.buffer_view
}
// Dispatch: iota — out[i] = f16(i) + 0.0 for i in 0..32767.
// NOTE(review): f16 has an 11-bit significand, so integer indices above 2048
// are not exactly representable; values toward 32767 will be rounded.
// Confirm this precision loss is acceptable for the consumer (jit_eval_2).
flow.executable private @jit_eval_1_dispatch_0 {
flow.executable.export public @jit_eval_1_dispatch_0_generic_32768_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_1_dispatch_0_generic_32768_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32768xf16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<32768xf16>
%1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<32768xf16>) {
^bb0(%out: f16):
// i (lane index) converted to f16, plus a zero offset.
%2 = linalg.index 0 : index
%3 = arith.index_cast %2 : index to i64
%4 = arith.sitofp %3 : i64 to f16
%5 = arith.addf %4, %cst : f16
linalg.yield %5 : f16
} -> tensor<32768xf16>
flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32768], strides = [1] : tensor<32768xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768xf16>>
return
}
}
}
// Public entry point: runs the 32768-element iota dispatch and exports it.
func.func @jit_eval_1() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_1() -> (%output0: tensor<32768xf16>)"}} {
%0 = flow.dispatch @jit_eval_1_dispatch_0::@jit_eval_1_dispatch_0_generic_32768_f16() : () -> tensor<32768xf16>
%1 = hal.tensor.export %0 "output0" : tensor<32768xf16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: over the 32768x32 outer product p = in0[row] * in1[col],
// produce two results in one pass: out0 = sin(p), out1 = cos(p).
// in0 is broadcast along columns (#map1), in1 along rows (#map2).
flow.executable private @jit_eval_2_dispatch_0 {
flow.executable.export public @jit_eval_2_dispatch_0_generic_32768x32_f16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_0_generic_32768x32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32768xf16>>, %arg1: !flow.dispatch.tensor<readonly:tensor<32xf16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>, %arg3: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32768xf16>> -> tensor<32768xf16>
%1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16>
%2 = tensor.empty() : tensor<32768x32xf16>
// Two-result generic: both outputs share the same iteration space.
%3:2 = linalg.generic {indexing_maps = [#map1, #map2, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<32768xf16>, tensor<32xf16>) outs(%2, %2 : tensor<32768x32xf16>, tensor<32768x32xf16>) {
^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16):
%4 = arith.mulf %in, %in_0 : f16
%5 = math.sin %4 : f16
%6 = math.cos %4 : f16
linalg.yield %5, %6 : f16, f16
} -> (tensor<32768x32xf16>, tensor<32768x32xf16>)
// Result 0 = sin table, result 1 = cos table.
flow.dispatch.tensor.store %3#0, %arg2, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>
flow.dispatch.tensor.store %3#1, %arg3, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>>
return
}
}
}
// Dispatch: copy a 32768x32 f16 tensor into the even lanes (last-dim
// offset 0) of a read-write 32768x32x2 destination, leaving the odd lanes
// untouched. The export name marks this as a slow (strided) memcpy.
flow.executable private @jit_eval_2_dispatch_1 {
flow.executable.export public @jit_eval_2_dispatch_1_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_1_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16>
// Store into [.., .., 0]: element 0 of each size-2 pair.
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 0], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>
return
}
}
}
// Dispatch: copy a 32768x32 f16 tensor into the odd lanes (last-dim
// offset 1) of a read-write 32768x32x2 destination — the counterpart of
// @jit_eval_2_dispatch_1, which fills the even lanes.
flow.executable private @jit_eval_2_dispatch_2 {
flow.executable.export public @jit_eval_2_dispatch_2_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_2_dispatch_2_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>>, %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>) {
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16>
// Store into [.., .., 1]: element 1 of each size-2 pair.
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 1], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>>
return
}
}
}
// Public entry point: builds an interleaved (cos, sin) table and returns the
// first 4 rows.
//   1. dispatch_0 produces sin and cos of the 32768x32 outer product of the
//      two inputs.
//   2. dispatch_1 writes the cos table (%2#1) into lane 0 of a 32768x32x2
//      buffer; dispatch_2 writes the sin table (%2#0) into lane 1, so the
//      last dim is [cos, sin].
//   3. Reshape + slice keeps only the first 4 of 32768 positions -> 4x32x2.
// NOTE(review): this matches a rotary-position-embedding cache build
// (positions x frequencies, interleaved cos/sin) — confirm with the frontend.
func.func @jit_eval_2(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_2(%input0: tensor<32768xf16>, %input1: tensor<32xf16>) -> (%output0: tensor<4x32x2xf16>)"}} {
%c2 = arith.constant 2 : index
%c32 = arith.constant 32 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32768xf16>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32xf16>
// %2#0 = sin table, %2#1 = cos table (see dispatch_0).
%2:2 = flow.dispatch @jit_eval_2_dispatch_0::@jit_eval_2_dispatch_0_generic_32768x32_f16(%0, %1) : (tensor<32768xf16>, tensor<32xf16>) -> (tensor<32768x32xf16>, tensor<32768x32xf16>)
%3 = flow.tensor.empty : tensor<32768x32x2xf16>
// In-place updates of %3: cos into lane 0, then sin into lane 1.
%4 = flow.dispatch @jit_eval_2_dispatch_1::@jit_eval_2_dispatch_1_slow_memcpy(%2#1, %3) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %3
%5 = flow.dispatch @jit_eval_2_dispatch_2::@jit_eval_2_dispatch_2_slow_memcpy(%2#0, %4) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %4
// Keep only the first 4 positions: 32768x32x2 -> 4x32x2.
%6 = flow.tensor.reshape %5 : tensor<32768x32x2xf16> -> tensor<1x32768x32x2xf16>
%7 = flow.tensor.slice %6[%c0, %c0, %c0, %c0 for %c1, %c4, %c32, %c2] : tensor<1x32768x32x2xf16> -> tensor<1x4x32x2xf16>
%8 = flow.tensor.reshape %7 : tensor<1x4x32x2xf16> -> tensor<4x32x2xf16>
%9 = hal.tensor.export %8 "output0" : tensor<4x32x2xf16> -> !hal.buffer_view
return %9 : !hal.buffer_view
}
// Dispatch: gather the even (last-dim index 0) lanes of a 4x1x1x32x2 tensor
// into a dense 4x32 result — a strided "de-interleave" copy.
flow.executable private @jit_eval_4_dispatch_0 {
flow.executable.export public @jit_eval_4_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_4_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>) {
// Load size-1 slice at last-dim offset 0, with unit dims dropped -> 4x32.
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 0], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16>
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>
return
}
}
}
// Public entry point: extracts the lane-0 plane of a 4x32x2 tensor -> 4x32.
// Given jit_eval_2's layout ([cos, sin] in the last dim), this selects the
// cos half — presumably; confirm the caller feeds jit_eval_2's output here.
func.func @jit_eval_4(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_4(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16>
%1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16>
%2 = flow.dispatch @jit_eval_4_dispatch_0::@jit_eval_4_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// Dispatch: gather the odd (last-dim index 1) lanes of a 4x1x1x32x2 tensor
// into a dense 4x32 result — the counterpart of @jit_eval_4_dispatch_0,
// which gathers the even lanes.
flow.executable private @jit_eval_5_dispatch_0 {
flow.executable.export public @jit_eval_5_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_5_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>) {
// Load size-1 slice at last-dim offset 1, with unit dims dropped -> 4x32.
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 1], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16>
flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>>
return
}
}
}
// Public entry point: extracts the lane-1 plane of a 4x32x2 tensor -> 4x32.
// Given jit_eval_2's layout ([cos, sin] in the last dim), this selects the
// sin half — presumably; confirm the caller feeds jit_eval_2's output here.
func.func @jit_eval_5(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_5(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16>
%1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16>
%2 = flow.dispatch @jit_eval_5_dispatch_0::@jit_eval_5_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
// Dispatch: write the scalar constant 0.0 (f16) into a 0-D output tensor.
// NOTE(review): the export name says "transpose", but the body is a plain
// zero fill — likely a fused/renamed dispatch from an earlier transpose;
// confirm against the original program.
flow.executable private @jit_eval_6_dispatch_0 {
// Single workgroup: the result is a scalar.
flow.executable.export public @jit_eval_6_dispatch_0_transpose workgroups() -> (index, index, index) {
%c1 = arith.constant 1 : index
flow.return %c1, %c1, %c1 : index, index, index
}
builtin.module {
func.func @jit_eval_6_dispatch_0_transpose(%arg0: !flow.dispatch.tensor<writeonly:tensor<f16>>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<f16>
// 0-D generic over an empty iteration space: yields the constant once.
%1 = linalg.generic {indexing_maps = [#map4], iterator_types = []} outs(%0 : tensor<f16>) {
^bb0(%out: f16):
linalg.yield %cst : f16
} -> tensor<f16>
flow.dispatch.tensor.store %1, %arg0, offsets = [], sizes = [], strides = [] : tensor<f16> -> !flow.dispatch.tensor<writeonly:tensor<f16>>
return
}
}
}
// Public entry point: returns a 0-D f16 tensor containing 0.0.
func.func @jit_eval_6() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_6() -> (%output0: tensor<f16>)"}} {
%0 = flow.dispatch @jit_eval_6_dispatch_0::@jit_eval_6_dispatch_0_transpose() : () -> tensor<f16>
%1 = hal.tensor.export %0 "output0" : tensor<f16> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Public entry point: returns a 0-D f32 tensor splatted with -infinity
// (0xFF800000 is the IEEE-754 single-precision bit pattern for -inf).
// No dispatch needed — flow.tensor.splat handles it directly.
func.func @jit_eval_7() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_7() -> (%output0: tensor<f32>)"}} {
%cst = arith.constant 0xFF800000 : f32
%0 = flow.tensor.splat %cst : tensor<f32>
%1 = hal.tensor.export %0 "output0" : tensor<f32> -> !hal.buffer_view
return %1 : !hal.buffer_view
}
// Dispatch: build a 4x4 f16 matrix from two scalars:
//   out[r][c] = truncf(in0 : f32)  when c > r   (strict upper triangle)
//   out[r][c] = in1 : f16          when c <= r  (diagonal and below)
// NOTE(review): with in0 = -inf (see @jit_eval_7) and in1 = 0, this is a
// causal attention mask — presumably the intent; confirm with the caller.
flow.executable private @jit_eval_8_dispatch_0 {
flow.executable.export public @jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
flow.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16(%arg0: !flow.dispatch.tensor<readonly:tensor<f32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<f16>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<4x4xf16>>) {
%false = arith.constant false
%0 = flow.dispatch.tensor.load %arg0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
%1 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f16>> -> tensor<f16>
%2 = tensor.empty() : tensor<4x4xf16>
// Both scalar inputs broadcast (#map5) over the 4x4 iteration space.
%3 = linalg.generic {indexing_maps = [#map5, #map5, #map3], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<f32>, tensor<f16>) outs(%2 : tensor<4x4xf16>) {
^bb0(%in: f32, %in_0: f16, %out: f16):
// f32 scalar narrowed to f16 for the upper-triangle value.
%4 = arith.truncf %in : f32 to f16
// r = row index, c = column index (as i64).
%5 = linalg.index 0 : index
%6 = linalg.index 1 : index
%7 = arith.index_cast %5 : index to i64
%8 = arith.index_cast %6 : index to i64
// %9 = (c <= r); %10 = !(c <= r), i.e. c > r.
%9 = arith.cmpi sle, %8, %7 : i64
%10 = arith.cmpi eq, %9, %false : i1
// c > r ? truncf(in0) : in1
%11 = arith.select %10, %4, %in_0 : f16
linalg.yield %11 : f16
} -> tensor<4x4xf16>
flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : tensor<4x4xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x4xf16>>
return
}
}
}
// Public entry point: imports two scalars (f32, f16) and returns the 4x4
// triangular-select matrix built by @jit_eval_8_dispatch_0.
func.func @jit_eval_8(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_8(%input0: tensor<f32>, %input1: tensor<f16>) -> (%output0: tensor<4x4xf16>)"}} {
%0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<f32>
%1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<f16>
%2 = flow.dispatch @jit_eval_8_dispatch_0::@jit_eval_8_dispatch_0_generic_4x4_f32xf16xf16(%0, %1) : (tensor<f32>, tensor<f16>) -> tensor<4x4xf16>
%3 = hal.tensor.export %2 "output0" : tensor<4x4xf16> -> !hal.buffer_view
return %3 : !hal.buffer_view
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment