AmosLewis · December 13, 2023 20:13
diff --git a/chatglm_dispatch.mlir b/chatglm_dispatch.mlir
 (shark.venv) ➜  SHARK git:(main) ✗ iree-compile chatglm-6b-int4.mlir --iree-input-type=tm_tensor --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=llvm-cpu --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --mlir-pass-pipeline-crash-reproducer=/nodclouddata/chi/src/SHARK/nan/dispatch/2/tmp/core-reproducer.mlir --iree-llvmcpu-target-cpu-features=host --iree-llvmcpu-target-triple=x86_64-linux-gnu --iree-llvmcpu-enable-ukernels --iree-llvmcpu-stack-allocation-limit=256000 --iree-global-opt-enable-quantized-matmul-reassociation --iree-stream-resource-max-allocation-size=4294967295 --iree-vm-bytecode-module-strip-source-map=true --iree-util-zero-fill-elided-attrs --iree-opt-strip-assertions=false --verify=true --iree-flow-break-dispatch=@forward:9 --iree-flow-trace-dispatch-tensors -mlir-print-ir-after=iree-flow-annotate-dispatches -mlir-elide-elementsattrs-if-larger=4 -o /tmp/chatglm9.vmfb

 // -----// IR Dump After AnnotateDispatches (iree-flow-annotate-dispatches) //----- //
 module attributes {hal.device.targets = [#hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "haswell", cpu_features = "-prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,-xsaves,-avx512fp16,-usermsr,-sm4,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,-xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,-adx,+avx2,-hreset,-movdiri,-serialize,-vpclmulqdq,-avx512vl,-uintr,-clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,-clzero,-mwaitx,-lwp,+lzcnt,-sha,-movdir64b,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,-rdpru,-clwb,+mmx,+sse2,-rdseed,-avx512vbmi2,-prefetchi,-rdpid,-fma4,-avx512vbmi,-shstk,-vaes,-waitpkg,-sgx,+fxsr,-avx512dq,-sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "--iree-llvmcpu-stack-allocation-limit=256000"}>]}>], iree.consteval} {
  flow.executable private @jit_eval_dispatch_0 {
    flow.executable.export public @jit_eval_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
    } loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
    builtin.module {
      func.func @jit_eval_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32xf16>> loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))) {
        %cst = arith.constant 2.000000e+00 : f16 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
        %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::div"("<eval_with_key>.5":9:10)))
        %cst_1 = arith.constant 6.400000e+01 : f16 loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
        %0 = tensor.empty() : tensor<32xf16> loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
        %1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} outs(%0 : tensor<32xf16>) {
        ^bb0(%out: f16 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))):
          %2 = linalg.index 0 : index loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
          %3 = arith.index_cast %2 : index to i64 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
          %4 = arith.sitofp %3 : i64 to f16 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
          %5 = arith.mulf %4, %cst : f16 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
          %6 = arith.addf %5, %cst_0 : f16 loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::div"("<eval_with_key>.5":9:10)))
          %7 = arith.divf %6, %cst_1 : f16 loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
          linalg.yield %7 : f16 loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
        } -> tensor<32xf16> loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
        flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>> loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
        return loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
      } loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
    } loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
  } loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
  func.func @jit_eval() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval() -> (%output0: tensor<32xf16>)"}} {
    %0 = flow.dispatch @jit_eval_dispatch_0::@jit_eval_dispatch_0_generic_32_f16() : () -> tensor<32xf16> loc(callsite("aten::div"("<eval_with_key>.5":9:10) at "aten::div"("<eval_with_key>.5":9:10)))
    %1 = hal.tensor.export %0 "output0" : tensor<32xf16> -> !hal.buffer_view loc("aten::div"("<eval_with_key>.5":9:10))
    return %1 : !hal.buffer_view loc("aten::div"("<eval_with_key>.5":9:10))
  } loc("aten::div"("<eval_with_key>.5":9:10))
  flow.executable private @jit_eval_0_dispatch_0 {
    flow.executable.export public @jit_eval_0_dispatch_0_generic_32_f16 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
    } loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
    builtin.module {
      func.func @jit_eval_0_dispatch_0_generic_32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32xf16>> loc("aten::reciprocal"("<eval_with_key>.5":11:17)), %arg1: !flow.dispatch.tensor<writeonly:tensor<32xf16>> loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))) {
        %cst = arith.constant 1.000000e+04 : f16 loc(callsite("aten::pow"("<eval_with_key>.5":10:12) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        %cst_0 = arith.constant 0.000000e+00 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        %cst_1 = arith.constant 1.000000e+00 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16> loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        %1 = tensor.empty() : tensor<32xf16> loc(callsite("aten::arange"("<eval_with_key>.5":8:13) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        %2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<32xf16>) outs(%1 : tensor<32xf16>) {
        ^bb0(%in: f16 loc("aten::div"("<eval_with_key>.5":9:10)), %out: f16 loc("aten::reciprocal"("<eval_with_key>.5":11:17))):
          %3 = math.powf %cst, %in : f16 loc(callsite("aten::pow"("<eval_with_key>.5":10:12) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
          %4 = arith.cmpf one, %3, %cst_0 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
          cf.assert %4, "unimplemented: tensor with zero element" loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
          %5 = arith.divf %cst_1, %3 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
          linalg.yield %5 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        } -> tensor<32xf16> loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        flow.dispatch.tensor.store %2, %arg1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32xf16>> loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
        return loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
      } loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
    } loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
  } loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
  func.func @jit_eval_0(%arg0: !hal.buffer_view loc("aten::reciprocal"("<eval_with_key>.5":11:17))) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_0(%input0: tensor<32xf16>) -> (%output0: tensor<32xf16>)"}} {
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32xf16> loc("aten::reciprocal"("<eval_with_key>.5":11:17))
    %1 = flow.dispatch @jit_eval_0_dispatch_0::@jit_eval_0_dispatch_0_generic_32_f16(%0) : (tensor<32xf16>) -> tensor<32xf16> loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::reciprocal"("<eval_with_key>.5":11:17)))
    %2 = hal.tensor.export %1 "output0" : tensor<32xf16> -> !hal.buffer_view loc("aten::reciprocal"("<eval_with_key>.5":11:17))
    return %2 : !hal.buffer_view loc("aten::reciprocal"("<eval_with_key>.5":11:17))
  } loc("aten::reciprocal"("<eval_with_key>.5":11:17))
  flow.executable private @jit_eval_1_dispatch_0 {
    flow.executable.export public @jit_eval_1_dispatch_0_generic_32768_f16 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
    } loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
    builtin.module {
      func.func @jit_eval_1_dispatch_0_generic_32768_f16(%arg0: !flow.dispatch.tensor<writeonly:tensor<32768xf16>> loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))) {
        %cst = arith.constant 0.000000e+00 : f16 loc(callsite("aten::reciprocal"("<eval_with_key>.5":11:17) at "aten::arange"("<eval_with_key>.5":13:15)))
        %0 = tensor.empty() : tensor<32768xf16> loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
        %1 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} outs(%0 : tensor<32768xf16>) {
        ^bb0(%out: f16 loc("aten::arange"("<eval_with_key>.5":13:15))):
          %2 = linalg.index 0 : index loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
          %3 = arith.index_cast %2 : index to i64 loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
          %4 = arith.sitofp %3 : i64 to f16 loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
          %5 = arith.addf %4, %cst : f16 loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
          linalg.yield %5 : f16 loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
        } -> tensor<32768xf16> loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
        flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [32768], strides = [1] : tensor<32768xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768xf16>> loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
        return loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
      } loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
    } loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
  } loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
  func.func @jit_eval_1() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_1() -> (%output0: tensor<32768xf16>)"}} {
    %0 = flow.dispatch @jit_eval_1_dispatch_0::@jit_eval_1_dispatch_0_generic_32768_f16() : () -> tensor<32768xf16> loc(callsite("aten::arange"("<eval_with_key>.5":13:15) at "aten::arange"("<eval_with_key>.5":13:15)))
    %1 = hal.tensor.export %0 "output0" : tensor<32768xf16> -> !hal.buffer_view loc("aten::arange"("<eval_with_key>.5":13:15))
    return %1 : !hal.buffer_view loc("aten::arange"("<eval_with_key>.5":13:15))
  } loc("aten::arange"("<eval_with_key>.5":13:15))
  flow.executable private @jit_eval_2_dispatch_0 {
    flow.executable.export public @jit_eval_2_dispatch_0_generic_32768x32_f16 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
    builtin.module {
      func.func @jit_eval_2_dispatch_0_generic_32768x32_f16(%arg0: !flow.dispatch.tensor<readonly:tensor<32768xf16>> loc("aten::slice"("<eval_with_key>.5":20:14)), %arg1: !flow.dispatch.tensor<readonly:tensor<32xf16>> loc("aten::slice"("<eval_with_key>.5":20:14)), %arg2: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14))), %arg3: !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [32768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32768xf16>> -> tensor<32768xf16> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf16>> -> tensor<32xf16> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        %2 = tensor.empty() : tensor<32768x32xf16> loc(callsite("aten::mul"("<eval_with_key>.5":15:12) at "aten::slice"("<eval_with_key>.5":20:14)))
        %3:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<32768xf16>, tensor<32xf16>) outs(%2, %2 : tensor<32768x32xf16>, tensor<32768x32xf16>) {
        ^bb0(%in: f16 loc("aten::view"("<eval_with_key>.5":14:11)), %in_0: f16 loc("aten::mul"("<eval_with_key>.5":12:10)), %out: f16 loc("aten::sin"("<eval_with_key>.5":17:10)), %out_1: f16 loc("aten::cos"("<eval_with_key>.5":16:10))):
          %4 = arith.mulf %in, %in_0 : f16 loc(callsite("aten::mul"("<eval_with_key>.5":15:12) at "aten::slice"("<eval_with_key>.5":20:14)))
          %5 = math.sin %4 : f16 loc(callsite("aten::sin"("<eval_with_key>.5":17:10) at "aten::slice"("<eval_with_key>.5":20:14)))
          %6 = math.cos %4 : f16 loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
          linalg.yield %5, %6 : f16, f16 loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        } -> (tensor<32768x32xf16>, tensor<32768x32xf16>) loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        flow.dispatch.tensor.store %3#0, %arg2, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        flow.dispatch.tensor.store %3#1, %arg3, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
        return loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
      } loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
  } loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
  flow.executable private @jit_eval_2_dispatch_1 {
    flow.executable.export public @jit_eval_2_dispatch_1_slow_memcpy workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    builtin.module {
      func.func @jit_eval_2_dispatch_1_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14))), %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
        flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 0], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
        return loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
      } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
  } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
  flow.executable private @jit_eval_2_dispatch_2 {
    flow.executable.export public @jit_eval_2_dispatch_2_slow_memcpy workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    builtin.module {
      func.func @jit_eval_2_dispatch_2_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14))), %arg1: !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [32768, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x32xf16>> -> tensor<32768x32xf16> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
        flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0, 1], sizes = [32768, 32, 1], strides = [1, 1, 1] : tensor<32768x32xf16> -> !flow.dispatch.tensor<readwrite:tensor<32768x32x2xf16>> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
        return loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
      } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
  } loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
  func.func @jit_eval_2(%arg0: !hal.buffer_view loc("aten::slice"("<eval_with_key>.5":20:14)), %arg1: !hal.buffer_view loc("aten::slice"("<eval_with_key>.5":20:14))) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_2(%input0: tensor<32768xf16>, %input1: tensor<32xf16>) -> (%output0: tensor<4x32x2xf16>)"}} {
    %c2 = arith.constant 2 : index loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %c32 = arith.constant 32 : index loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %c4 = arith.constant 4 : index loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %c1 = arith.constant 1 : index loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %c0 = arith.constant 0 : index loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32768xf16> loc("aten::slice"("<eval_with_key>.5":20:14))
    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32xf16> loc("aten::slice"("<eval_with_key>.5":20:14))
    %2:2 = flow.dispatch @jit_eval_2_dispatch_0::@jit_eval_2_dispatch_0_generic_32768x32_f16(%0, %1) : (tensor<32768xf16>, tensor<32xf16>) -> (tensor<32768x32xf16>, tensor<32768x32xf16>) loc(callsite("aten::cos"("<eval_with_key>.5":16:10) at "aten::slice"("<eval_with_key>.5":20:14)))
    %3 = flow.tensor.empty : tensor<32768x32x2xf16> loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    %4 = flow.dispatch @jit_eval_2_dispatch_1::@jit_eval_2_dispatch_1_slow_memcpy(%2#1, %3) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %3 loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    %5 = flow.dispatch @jit_eval_2_dispatch_2::@jit_eval_2_dispatch_2_slow_memcpy(%2#0, %4) : (tensor<32768x32xf16>, tensor<32768x32x2xf16>) -> %4 loc(callsite("aten::stack"("<eval_with_key>.5":18:12) at "aten::slice"("<eval_with_key>.5":20:14)))
    %6 = flow.tensor.reshape %5 : tensor<32768x32x2xf16> -> tensor<1x32768x32x2xf16> loc(callsite("aten::unsqueeze"("<eval_with_key>.5":19:16) at "aten::slice"("<eval_with_key>.5":20:14)))
    %7 = flow.tensor.slice %6[%c0, %c0, %c0, %c0 for %c1, %c4, %c32, %c2] : tensor<1x32768x32x2xf16> -> tensor<1x4x32x2xf16> loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %8 = flow.tensor.reshape %7 : tensor<1x4x32x2xf16> -> tensor<4x32x2xf16> loc(callsite("aten::slice"("<eval_with_key>.5":20:14) at "aten::slice"("<eval_with_key>.5":20:14)))
    %9 = hal.tensor.export %8 "output0" : tensor<4x32x2xf16> -> !hal.buffer_view loc("aten::slice"("<eval_with_key>.5":20:14))
    return %9 : !hal.buffer_view loc("aten::slice"("<eval_with_key>.5":20:14))
  } loc("aten::slice"("<eval_with_key>.5":20:14))
  flow.executable private @jit_eval_3_dispatch_0 {
    flow.executable.export public @jit_eval_3_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
    } loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
    builtin.module {
      func.func @jit_eval_3_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> loc(callsite("aten::view"("<eval_with_key>.5":52:13) at "aten::select"("<eval_with_key>.5":54:15))), %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>> loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 0], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16> loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
        flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>> loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
        return loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
      } loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
    } loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
  } loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
  func.func @jit_eval_3(%arg0: !hal.buffer_view loc("aten::select"("<eval_with_key>.5":54:15))) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_3(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16> loc("aten::select"("<eval_with_key>.5":54:15))
    %1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16> loc(callsite("aten::view"("<eval_with_key>.5":52:13) at "aten::select"("<eval_with_key>.5":54:15)))
    %2 = flow.dispatch @jit_eval_3_dispatch_0::@jit_eval_3_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16> loc(callsite("aten::select"("<eval_with_key>.5":54:15) at "aten::select"("<eval_with_key>.5":54:15)))
    %3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view loc("aten::select"("<eval_with_key>.5":54:15))
    return %3 : !hal.buffer_view loc("aten::select"("<eval_with_key>.5":54:15))
  } loc("aten::select"("<eval_with_key>.5":54:15))
  flow.executable private @jit_eval_4_dispatch_0 {
    flow.executable.export public @jit_eval_4_dispatch_0_slow_memcpy workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
    } loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
    builtin.module {
      func.func @jit_eval_4_dispatch_0_slow_memcpy(%arg0: !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> loc(callsite("aten::view"("<eval_with_key>.5":52:13) at "aten::select"("<eval_with_key>.5":56:15))), %arg1: !flow.dispatch.tensor<writeonly:tensor<4x32xf16>> loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))) {
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0, 0, 1], sizes = [4, 1, 1, 32, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x1x1x32x2xf16>> -> tensor<4x32xf16> loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
        flow.dispatch.tensor.store %0, %arg1, offsets = [0, 0], sizes = [4, 32], strides = [1, 1] : tensor<4x32xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x32xf16>> loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
        return loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
      } loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
    } loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
  } loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
  func.func @jit_eval_4(%arg0: !hal.buffer_view loc("aten::select"("<eval_with_key>.5":56:15))) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_4(%input0: tensor<4x32x2xf16>) -> (%output0: tensor<4x32xf16>)"}} {
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<4x32x2xf16> loc("aten::select"("<eval_with_key>.5":56:15))
    %1 = flow.tensor.reshape %0 : tensor<4x32x2xf16> -> tensor<4x1x1x32x2xf16> loc(callsite("aten::view"("<eval_with_key>.5":52:13) at "aten::select"("<eval_with_key>.5":56:15)))
    %2 = flow.dispatch @jit_eval_4_dispatch_0::@jit_eval_4_dispatch_0_slow_memcpy(%1) : (tensor<4x1x1x32x2xf16>) -> tensor<4x32xf16> loc(callsite("aten::select"("<eval_with_key>.5":56:15) at "aten::select"("<eval_with_key>.5":56:15)))
    %3 = hal.tensor.export %2 "output0" : tensor<4x32xf16> -> !hal.buffer_view loc("aten::select"("<eval_with_key>.5":56:15))
    return %3 : !hal.buffer_view loc("aten::select"("<eval_with_key>.5":56:15))
  } loc("aten::select"("<eval_with_key>.5":56:15))
  flow.executable private @jit_eval_5_dispatch_0 {
    flow.executable.export public @jit_eval_5_dispatch_0_generic workgroups() -> (index, index, index) {
      %c1 = arith.constant 1 : index loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
      flow.return %c1, %c1, %c1 : index, index, index loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
    } loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
    builtin.module {
      func.func @jit_eval_5_dispatch_0_generic(%arg0: !flow.dispatch.tensor<writeonly:tensor<f16>> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))) {
        %cst = arith.constant 0.000000e+00 : f16 loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
        %0 = tensor.empty() : tensor<f16> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
        %1 = linalg.generic {indexing_maps = [affine_map<() -> ()>], iterator_types = []} outs(%0 : tensor<f16>) {
        ^bb0(%out: f16 loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))):
          linalg.yield %cst : f16 loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
        } -> tensor<f16> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
        flow.dispatch.tensor.store %1, %arg0, offsets = [], sizes = [], strides = [] : tensor<f16> -> !flow.dispatch.tensor<writeonly:tensor<f16>> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
        return loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
      } loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
    } loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
  } loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
  func.func @jit_eval_5() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_5() -> (%output0: tensor<f16>)"}} {
    %0 = flow.dispatch @jit_eval_5_dispatch_0::@jit_eval_5_dispatch_0_generic() : () -> tensor<f16> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::zeros_like"("<eval_with_key>.5":142:17)))
    %1 = hal.tensor.export %0 "output0" : tensor<f16> -> !hal.buffer_view loc("aten::zeros_like"("<eval_with_key>.5":142:17))
    return %1 : !hal.buffer_view loc("aten::zeros_like"("<eval_with_key>.5":142:17))
  } loc("aten::zeros_like"("<eval_with_key>.5":142:17))
  func.func @jit_eval_6() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_6() -> (%output0: tensor<f32>)"}} {
    %cst = arith.constant 0xFF800000 : f32 loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    %0 = flow.tensor.splat %cst : tensor<f32> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    %1 = hal.tensor.export %0 "output0" : tensor<f32> -> !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
    return %1 : !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
  } loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
  flow.executable private @jit_eval_7_dispatch_0 {
    flow.executable.export public @jit_eval_7_dispatch_0_generic_4x4_f32xf16xf16 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice  loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
      flow.return %x, %y, %z : index, index, index loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    } loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    builtin.module {
      func.func @jit_eval_7_dispatch_0_generic_4x4_f32xf16xf16(%arg0: !flow.dispatch.tensor<readonly:tensor<f32>> loc("aten::masked_fill_"("<eval_with_key>.5":144:19)), %arg1: !flow.dispatch.tensor<readonly:tensor<f16>> loc("aten::masked_fill_"("<eval_with_key>.5":144:19)), %arg2: !flow.dispatch.tensor<writeonly:tensor<4x4xf16>> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))) {
        %false = arith.constant false loc(callsite(unknown at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        %0 = flow.dispatch.tensor.load %arg0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        %1 = flow.dispatch.tensor.load %arg1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f16>> -> tensor<f16> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        %2 = tensor.empty() : tensor<4x4xf16> loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : tensor<f32>, tensor<f16>) outs(%2 : tensor<4x4xf16>) {
        ^bb0(%in: f32 loc("aten::masked_fill_"("<eval_with_key>.5":144:19)), %in_0: f16 loc("aten::masked_fill_"("<eval_with_key>.5":144:19)), %out: f16 loc(callsite("aten::zeros_like"("<eval_with_key>.5":142:17) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))):
          %4 = arith.truncf %in : f32 to f16 loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %5 = linalg.index 0 : index loc(callsite("aten::tril"("<eval_with_key>.5":141:11) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %6 = linalg.index 1 : index loc(callsite("aten::tril"("<eval_with_key>.5":141:11) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %7 = arith.index_cast %5 : index to i64 loc(callsite("aten::tril"("<eval_with_key>.5":141:11) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %8 = arith.index_cast %6 : index to i64 loc(callsite("aten::tril"("<eval_with_key>.5":141:11) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %9 = arith.cmpi sle, %8, %7 : i64 loc(callsite("aten::tril"("<eval_with_key>.5":141:11) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %10 = arith.cmpi eq, %9, %false : i1 loc(callsite("aten::logical_not"("<eval_with_key>.5":143:18) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          %11 = arith.select %10, %4, %in_0 : f16 loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
          linalg.yield %11 : f16 loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        } -> tensor<4x4xf16> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        flow.dispatch.tensor.store %3, %arg2, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : tensor<4x4xf16> -> !flow.dispatch.tensor<writeonly:tensor<4x4xf16>> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
        return loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
      } loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    } loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
  } loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
  func.func @jit_eval_7(%arg0: !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19)), %arg1: !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19))) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_7(%input0: tensor<f32>, %input1: tensor<f16>) -> (%output0: tensor<4x4xf16>)"}} {
    %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<f32> loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
    %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<f16> loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
    %2 = flow.dispatch @jit_eval_7_dispatch_0::@jit_eval_7_dispatch_0_generic_4x4_f32xf16xf16(%0, %1) : (tensor<f32>, tensor<f16>) -> tensor<4x4xf16> loc(callsite("aten::masked_fill_"("<eval_with_key>.5":144:19) at "aten::masked_fill_"("<eval_with_key>.5":144:19)))
    %3 = hal.tensor.export %2 "output0" : tensor<4x4xf16> -> !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
    return %3 : !hal.buffer_view loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
  } loc("aten::masked_fill_"("<eval_with_key>.5":144:19))
 } loc(unknown)

 === jit_eval_dispatch_0::jit_eval_dispatch_0_generic_32_f16 inputs ===

 === jit_eval_dispatch_0::jit_eval_dispatch_0_generic_32_f16 outputs ===
 32xf16=0 0.03125 0.0625 0.09375 0.125 0.15625 0.1875 0.21875 0.25 0.28125 0.3125 0.34375 0.375 0.40625 0.4375 0.46875 0.5 0.53125 0.5625 0.59375 0.625 0.65625 0.6875 0.71875 0.75 0.78125 0.8125 0.84375 0.875 0.90625 0.9375 0.96875

 === jit_eval_0_dispatch_0::jit_eval_0_dispatch_0_generic_32_f16 inputs ===
 32xf16=0 0.03125 0.0625 0.09375 0.125 0.15625 0.1875 0.21875 0.25 0.28125 0.3125 0.34375 0.375 0.40625 0.4375 0.46875 0.5 0.53125 0.5625 0.59375 0.625 0.65625 0.6875 0.71875 0.75 0.78125 0.8125 0.84375 0.875 0.90625 0.9375 0.96875

 === jit_eval_0_dispatch_0::jit_eval_0_dispatch_0_generic_32_f16 outputs ===
 32xf16=-NAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

 === jit_eval_1_dispatch_0::jit_eval_1_dispatch_0_generic_32768_f16 inputs ===